From def5cb78d01ce6ef72e47471ca4d7322f7a2db56 Mon Sep 17 00:00:00 2001
From: Clement Courbet
Date: Wed, 15 May 2019 08:21:18 +0000
Subject: [PATCH 01/19] [DAGCombiner][NFC] Add a comment.

As suggested in D61846.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360755 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8fe6f721584b..27da26446ee4 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19830,6 +19830,8 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
       return false;
   }

+  // Try to prove that there is aliasing, or that there is no aliasing. Either
+  // way, we can return now. If nothing can be proved, proceed with more tests.
   bool IsAlias;
   if (BaseIndexOffset::computeAliasing(Op0, MUC0.NumBytes, Op1, MUC1.NumBytes,
                                        DAG, IsAlias))

From 75b2eadb06abbc29db94691089f4dbcb50ba1f7f Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 15 May 2019 10:05:49 +0000
Subject: [PATCH 02/19] [LV] Move getScalarizationOverhead and vector call cost
 computations to CM. (NFC)

This reduces the number of parameters we need to pass in, and they seem a
natural fit in LoopVectorizationCostModel. Also simplifies things for
D59995.

As a follow-up refactoring, we could expose only a shouldUseVectorIntrinsic()
helper in LoopVectorizationCostModel, instead of calling
getVectorCallCost/getVectorIntrinsicCost in
InnerLoopVectorizer/VPRecipeBuilder.

Reviewers: Ayal, hsaito, dcaballe, rengolin

Reviewed By: rengolin

Differential Revision: https://reviews.llvm.org/D61638

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360758 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 116 +++++++++++----------
 lib/Transforms/Vectorize/VPRecipeBuilder.h |   7 +-
 2 files changed, 61 insertions(+), 62 deletions(-)

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 83f1c70f4cf2..ce03b854ffd5 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1169,6 +1169,18 @@ class LoopVectorizationCostModel {
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }

+  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
+  /// with factor VF. Return the cost of the instruction, including
+  /// scalarization overhead if it's needed.
+  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
+
+  /// Estimate cost of a call instruction CI if it were vectorized with factor
+  /// VF. Return the cost of the instruction, including scalarization overhead
+  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
+  /// scalarized -
+  /// i.e. either vector version isn't available, or is too expensive.
+  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
+
 private:
   unsigned NumPredStores = 0;

@@ -1221,6 +1233,10 @@ class LoopVectorizationCostModel {
   /// element)
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

+  /// Estimate the overhead of scalarizing an instruction. This is a
+  /// convenience wrapper for the type-based getScalarizationOverhead API.
+  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);

@@ -3057,45 +3073,9 @@ static void cse(BasicBlock *BB) {
   }
 }

-/// Estimate the overhead of scalarizing an instruction. This is a
-/// convenience wrapper for the type-based getScalarizationOverhead API.
-static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
-                                         const TargetTransformInfo &TTI) {
-  if (VF == 1)
-    return 0;
-
-  unsigned Cost = 0;
-  Type *RetTy = ToVectorTy(I->getType(), VF);
-  if (!RetTy->isVoidTy() &&
-      (!isa<StoreInst>(I) ||
-       !TTI.supportsEfficientVectorElementLoadStore()))
-    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
-
-  // Some targets keep addresses scalar.
-  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
-    return Cost;
-
-  if (CallInst *CI = dyn_cast<CallInst>(I)) {
-    SmallVector<const Value *, 4> Operands(CI->arg_operands());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
-  else if (!isa<StoreInst>(I) ||
-           !TTI.supportsEfficientVectorElementLoadStore()) {
-    SmallVector<const Value *, 4> Operands(I->operand_values());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
-
-  return Cost;
-}
-
-// Estimate cost of a call instruction CI if it were vectorized with factor VF.
-// Return the cost of the instruction, including scalarization overhead if it's
-// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
-// i.e. either vector version isn't available, or is too expensive.
-static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
-                                  const TargetTransformInfo &TTI,
-                                  const TargetLibraryInfo *TLI,
-                                  bool &NeedToScalarize) {
+unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
+                                                       unsigned VF,
+                                                       bool &NeedToScalarize) {
   Function *F = CI->getCalledFunction();
   StringRef FnName = CI->getCalledFunction()->getName();
   Type *ScalarRetTy = CI->getType();
@@ -3118,7 +3098,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,

   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
+  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);

   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;

@@ -3137,12 +3117,8 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
   return Cost;
 }

-// Estimate cost of an intrinsic call instruction CI if it were vectorized with
-// factor VF. Return the cost of the instruction, including scalarization
-// overhead if it's needed.
-static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
-                                       const TargetTransformInfo &TTI,
-                                       const TargetLibraryInfo *TLI) {
+unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
+                                                            unsigned VF) {
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");

@@ -4126,9 +4102,9 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
     // version of the instruction.
     // Is it beneficial to perform intrinsic call compared to lib call?
     bool NeedToScalarize;
-    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
     bool UseVectorIntrinsic =
-        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
     assert((UseVectorIntrinsic || !NeedToScalarize) &&
            "Instruction should be scalarized elsewhere.");

@@ -5522,7 +5498,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,

   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
-  Cost += getScalarizationOverhead(I, VF, TTI);
+  Cost += getScalarizationOverhead(I, VF);

   // If we have a predicated store, it may not be executed for each vector
   // lane. Scale the cost by the probability of executing the predicated
@@ -5674,6 +5650,34 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   return VectorizationCostTy(C, TypeNotScalarized);
 }

+unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+                                                              unsigned VF) {
+
+  if (VF == 1)
+    return 0;
+
+  unsigned Cost = 0;
+  Type *RetTy = ToVectorTy(I->getType(), VF);
+  if (!RetTy->isVoidTy() &&
+      (!isa<StoreInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
+    Cost += TTI.getScalarizationOverhead(RetTy, true, false);
+
+  // Some targets keep addresses scalar.
+  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+    return Cost;
+
+  if (CallInst *CI = dyn_cast<CallInst>(I)) {
+    SmallVector<const Value *, 4> Operands(CI->arg_operands());
+    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+  } else if (!isa<StoreInst>(I) ||
+             !TTI.supportsEfficientVectorElementLoadStore()) {
+    SmallVector<const Value *, 4> Operands(I->operand_values());
+    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+  }
+
+  return Cost;
+}
+
 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
   if (VF == 1)
     return;
@@ -5914,7 +5918,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,

     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
-    Cost += getScalarizationOverhead(I, VF, TTI);
+    Cost += getScalarizationOverhead(I, VF);

     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
@@ -6035,16 +6039,16 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::Call: {
     bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
-    unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
     if (getVectorIntrinsicIDForCall(CI, TLI))
-      return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
+      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
     return CallCost;
   }
   default:
     // The cost of executing VF copies of the scalar instruction. This opcode
     // is unknown. Assume that it is the same as 'mul'.
     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
-           getScalarizationOverhead(I, VF, TTI);
+           getScalarizationOverhead(I, VF);
   } // end of switch.
 }

@@ -6638,9 +6642,9 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
     // version of the instruction.
     // Is it beneficial to perform intrinsic call compared to lib call?
     bool NeedToScalarize;
-    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
     bool UseVectorIntrinsic =
-        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
     return UseVectorIntrinsic || !NeedToScalarize;
   }
   if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
@@ -6828,7 +6832,7 @@ LoopVectorizationPlanner::buildVPlanWithVPRecipes(
   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
   auto Plan = llvm::make_unique<VPlan>(VPBB);

-  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
+  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
   // Represent values that will have defs inside VPlan.
   for (Value *V : NeedDef)
     Plan->addVPValue(V);
diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h
index bc6b22120995..0ca6a6b93cfd 100644
--- a/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -29,9 +29,6 @@ class VPRecipeBuilder {
   /// Target Library Info.
   const TargetLibraryInfo *TLI;

-  /// Target Transform Info.
-  const TargetTransformInfo *TTI;
-
   /// The legality analysis.
   LoopVectorizationLegality *Legal;

@@ -104,11 +101,9 @@ class VPRecipeBuilder {

 public:
   VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
-                  const TargetTransformInfo *TTI,
                   LoopVectorizationLegality *Legal,
                   LoopVectorizationCostModel &CM, VPBuilder &Builder)
-      : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
-        Builder(Builder) {}
+      : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {}

   /// Check if a recipe can be created for \p I within the given VF \p Range.
   /// If a recipe can be created, it adds it to \p VPBB.
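As an illustration of the follow-up suggested in the message above, here is a
minimal sketch of what a shouldUseVectorIntrinsic() helper on
LoopVectorizationCostModel could look like. The name comes from the commit
message; the signature and body are assumptions inferred from the call sites
in this patch, not code from the commit:

    // Hypothetical follow-up helper (not part of this commit): fold the two
    // cost queries into one predicate so that InnerLoopVectorizer and
    // VPRecipeBuilder no longer compare costs themselves.
    bool LoopVectorizationCostModel::shouldUseVectorIntrinsic(
        CallInst *CI, unsigned VF, bool &NeedToScalarize) {
      // Cost of the widened call, including any scalarization overhead; this
      // also sets NeedToScalarize for the caller.
      unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
      // Prefer the intrinsic when one exists and is no more expensive.
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      return ID && getVectorIntrinsicCost(CI, VF) <= CallCost;
    }

A call site such as VPRecipeBuilder::tryToWiden would then reduce to:

    bool NeedToScalarize;
    bool UseVectorIntrinsic = CM.shouldUseVectorIntrinsic(CI, VF, NeedToScalarize);
    return UseVectorIntrinsic || !NeedToScalarize;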
From 4e5f939b0285b519314e1fa20866a90b949bc19b Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 15 May 2019 10:24:38 +0000 Subject: [PATCH 03/19] [NFC][InstCombine] Regenerate trunc.ll test git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360759 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Transforms/InstCombine/trunc.ll | 112 ++++++++++++++------------- 1 file changed, 57 insertions(+), 55 deletions(-) diff --git a/test/Transforms/InstCombine/trunc.ll b/test/Transforms/InstCombine/trunc.ll index 01d53ab98405..bff2fc3770fe 100644 --- a/test/Transforms/InstCombine/trunc.ll +++ b/test/Transforms/InstCombine/trunc.ll @@ -8,8 +8,8 @@ declare void @use(i32) define i64 @test1(i64 %a) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: [[B:%.*]] = trunc i64 %a to i32 -; CHECK-NEXT: [[C:%.*]] = and i64 %a, 15 +; CHECK-NEXT: [[B:%.*]] = trunc i64 [[A:%.*]] to i32 +; CHECK-NEXT: [[C:%.*]] = and i64 [[A]], 15 ; CHECK-NEXT: call void @use(i32 [[B]]) ; CHECK-NEXT: ret i64 [[C]] ; @@ -22,8 +22,8 @@ define i64 @test1(i64 %a) { define i64 @test2(i64 %a) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[B:%.*]] = trunc i64 %a to i32 -; CHECK-NEXT: [[D1:%.*]] = shl i64 %a, 36 +; CHECK-NEXT: [[B:%.*]] = trunc i64 [[A:%.*]] to i32 +; CHECK-NEXT: [[D1:%.*]] = shl i64 [[A]], 36 ; CHECK-NEXT: [[D:%.*]] = ashr exact i64 [[D1]], 36 ; CHECK-NEXT: call void @use(i32 [[B]]) ; CHECK-NEXT: ret i64 [[D]] @@ -38,8 +38,8 @@ define i64 @test2(i64 %a) { define i64 @test3(i64 %a) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: [[B:%.*]] = trunc i64 %a to i32 -; CHECK-NEXT: [[C:%.*]] = and i64 %a, 8 +; CHECK-NEXT: [[B:%.*]] = trunc i64 [[A:%.*]] to i32 +; CHECK-NEXT: [[C:%.*]] = and i64 [[A]], 8 ; CHECK-NEXT: call void @use(i32 [[B]]) ; CHECK-NEXT: ret i64 [[C]] ; @@ -52,8 +52,8 @@ define i64 @test3(i64 %a) { define i64 @test4(i64 %a) { ; CHECK-LABEL: @test4( -; CHECK-NEXT: [[B:%.*]] = trunc i64 %a to i32 -; CHECK-NEXT: [[C:%.*]] = and i64 %a, 8 +; CHECK-NEXT: [[B:%.*]] = trunc i64 [[A:%.*]] to i32 +; CHECK-NEXT: [[C:%.*]] = and i64 [[A]], 8 ; CHECK-NEXT: [[X:%.*]] = xor i64 [[C]], 8 ; CHECK-NEXT: call void @use(i32 [[B]]) ; CHECK-NEXT: ret i64 [[X]] @@ -68,8 +68,8 @@ define i64 @test4(i64 %a) { define i32 @test5(i32 %A) { ; CHECK-LABEL: @test5( -; CHECK-NEXT: [[C:%.*]] = lshr i32 %A, 16 -; CHECK-NEXT: ret i32 [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[A:%.*]], 16 +; CHECK-NEXT: ret i32 [[TMP1]] ; %B = zext i32 %A to i128 %C = lshr i128 %B, 16 @@ -79,8 +79,8 @@ define i32 @test5(i32 %A) { define i32 @test6(i64 %A) { ; CHECK-LABEL: @test6( -; CHECK-NEXT: [[C:%.*]] = lshr i64 %A, 32 -; CHECK-NEXT: [[D:%.*]] = trunc i64 [[C]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[A:%.*]], 32 +; CHECK-NEXT: [[D:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[D]] ; %B = zext i64 %A to i128 @@ -93,11 +93,12 @@ define i32 @test6(i64 %A) { ; but does contain sign bits, where the sign bit is not known to be zero. 
define i16 @ashr_mul_sign_bits(i8 %X, i8 %Y) { ; CHECK-LABEL: @ashr_mul_sign_bits( -; CHECK-NEXT: [[A:%.*]] = sext i8 %X to i16 -; CHECK-NEXT: [[B:%.*]] = sext i8 %Y to i16 +; CHECK-NEXT: [[A:%.*]] = sext i8 [[X:%.*]] to i16 +; CHECK-NEXT: [[B:%.*]] = sext i8 [[Y:%.*]] to i16 ; CHECK-NEXT: [[C:%.*]] = mul nsw i16 [[A]], [[B]] ; CHECK-NEXT: [[D:%.*]] = ashr i16 [[C]], 3 ; CHECK-NEXT: ret i16 [[D]] +; %A = sext i8 %X to i32 %B = sext i8 %Y to i32 %C = mul i32 %A, %B @@ -108,11 +109,12 @@ define i16 @ashr_mul_sign_bits(i8 %X, i8 %Y) { define i16 @ashr_mul(i8 %X, i8 %Y) { ; CHECK-LABEL: @ashr_mul( -; CHECK-NEXT: [[A:%.*]] = sext i8 %X to i16 -; CHECK-NEXT: [[B:%.*]] = sext i8 %Y to i16 +; CHECK-NEXT: [[A:%.*]] = sext i8 [[X:%.*]] to i16 +; CHECK-NEXT: [[B:%.*]] = sext i8 [[Y:%.*]] to i16 ; CHECK-NEXT: [[C:%.*]] = mul nsw i16 [[A]], [[B]] ; CHECK-NEXT: [[D:%.*]] = ashr i16 [[C]], 8 ; CHECK-NEXT: ret i16 [[D]] +; %A = sext i8 %X to i20 %B = sext i8 %Y to i20 %C = mul i20 %A, %B @@ -149,7 +151,7 @@ define <2 x i32> @trunc_ashr_vec(<2 x i32> %X) { define i92 @test7(i64 %A) { ; CHECK-LABEL: @test7( -; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 %A, 32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[A:%.*]], 32 ; CHECK-NEXT: [[D:%.*]] = zext i64 [[TMP1]] to i92 ; CHECK-NEXT: ret i92 [[D]] ; @@ -161,8 +163,8 @@ define i92 @test7(i64 %A) { define i64 @test8(i32 %A, i32 %B) { ; CHECK-LABEL: @test8( -; CHECK-NEXT: [[TMP38:%.*]] = zext i32 %A to i64 -; CHECK-NEXT: [[TMP32:%.*]] = zext i32 %B to i64 +; CHECK-NEXT: [[TMP38:%.*]] = zext i32 [[A:%.*]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = zext i32 [[B:%.*]] to i64 ; CHECK-NEXT: [[TMP33:%.*]] = shl nuw i64 [[TMP32]], 32 ; CHECK-NEXT: [[INS35:%.*]] = or i64 [[TMP33]], [[TMP38]] ; CHECK-NEXT: ret i64 [[INS35]] @@ -177,7 +179,7 @@ define i64 @test8(i32 %A, i32 %B) { define i8 @test9(i32 %X) { ; CHECK-LABEL: @test9( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 %X to i8 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i8 ; CHECK-NEXT: [[Z:%.*]] = and i8 [[TMP1]], 42 ; CHECK-NEXT: ret i8 [[Z]] ; @@ -189,7 +191,7 @@ define i8 @test9(i32 %X) { ; rdar://8808586 define i8 @test10(i32 %X) { ; CHECK-LABEL: @test10( -; CHECK-NEXT: [[Y:%.*]] = trunc i32 %X to i8 +; CHECK-NEXT: [[Y:%.*]] = trunc i32 [[X:%.*]] to i8 ; CHECK-NEXT: [[Z:%.*]] = and i8 [[Y]], 42 ; CHECK-NEXT: ret i8 [[Z]] ; @@ -204,7 +206,7 @@ define i8 @test10(i32 %X) { define i32 @trunc_bitcast1(<4 x i32> %v) { ; CHECK-LABEL: @trunc_bitcast1( -; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x i32> %v, i32 1 +; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x i32> [[V:%.*]], i32 1 ; CHECK-NEXT: ret i32 [[EXT]] ; %bc = bitcast <4 x i32> %v to i128 @@ -217,7 +219,7 @@ define i32 @trunc_bitcast1(<4 x i32> %v) { define i32 @trunc_bitcast2(<2 x i64> %v) { ; CHECK-LABEL: @trunc_bitcast2( -; CHECK-NEXT: [[BC1:%.*]] = bitcast <2 x i64> %v to <4 x i32> +; CHECK-NEXT: [[BC1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> ; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x i32> [[BC1]], i32 2 ; CHECK-NEXT: ret i32 [[EXT]] ; @@ -231,7 +233,7 @@ define i32 @trunc_bitcast2(<2 x i64> %v) { define i32 @trunc_bitcast3(<4 x i32> %v) { ; CHECK-LABEL: @trunc_bitcast3( -; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x i32> %v, i32 0 +; CHECK-NEXT: [[EXT:%.*]] = extractelement <4 x i32> [[V:%.*]], i32 0 ; CHECK-NEXT: ret i32 [[EXT]] ; %bc = bitcast <4 x i32> %v to i128 @@ -241,7 +243,7 @@ define i32 @trunc_bitcast3(<4 x i32> %v) { define i32 @trunc_shl_31_i32_i64(i64 %val) { ; CHECK-LABEL: @trunc_shl_31_i32_i64( -; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 
%val to i32 +; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32 ; CHECK-NEXT: [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 31 ; CHECK-NEXT: ret i32 [[TRUNC]] ; @@ -252,7 +254,7 @@ define i32 @trunc_shl_31_i32_i64(i64 %val) { define i32 @trunc_shl_nsw_31_i32_i64(i64 %val) { ; CHECK-LABEL: @trunc_shl_nsw_31_i32_i64( -; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 %val to i32 +; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32 ; CHECK-NEXT: [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 31 ; CHECK-NEXT: ret i32 [[TRUNC]] ; @@ -263,7 +265,7 @@ define i32 @trunc_shl_nsw_31_i32_i64(i64 %val) { define i32 @trunc_shl_nuw_31_i32_i64(i64 %val) { ; CHECK-LABEL: @trunc_shl_nuw_31_i32_i64( -; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 %val to i32 +; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32 ; CHECK-NEXT: [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 31 ; CHECK-NEXT: ret i32 [[TRUNC]] ; @@ -274,7 +276,7 @@ define i32 @trunc_shl_nuw_31_i32_i64(i64 %val) { define i32 @trunc_shl_nsw_nuw_31_i32_i64(i64 %val) { ; CHECK-LABEL: @trunc_shl_nsw_nuw_31_i32_i64( -; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 %val to i32 +; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32 ; CHECK-NEXT: [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 31 ; CHECK-NEXT: ret i32 [[TRUNC]] ; @@ -285,7 +287,7 @@ define i32 @trunc_shl_nsw_nuw_31_i32_i64(i64 %val) { define i16 @trunc_shl_15_i16_i64(i64 %val) { ; CHECK-LABEL: @trunc_shl_15_i16_i64( -; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 %val to i16 +; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i16 ; CHECK-NEXT: [[TRUNC:%.*]] = shl i16 [[VAL_TR]], 15 ; CHECK-NEXT: ret i16 [[TRUNC]] ; @@ -296,7 +298,7 @@ define i16 @trunc_shl_15_i16_i64(i64 %val) { define i16 @trunc_shl_15_i16_i32(i32 %val) { ; CHECK-LABEL: @trunc_shl_15_i16_i32( -; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i32 %val to i16 +; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i32 [[VAL:%.*]] to i16 ; CHECK-NEXT: [[TRUNC:%.*]] = shl i16 [[VAL_TR]], 15 ; CHECK-NEXT: ret i16 [[TRUNC]] ; @@ -307,7 +309,7 @@ define i16 @trunc_shl_15_i16_i32(i32 %val) { define i8 @trunc_shl_7_i8_i64(i64 %val) { ; CHECK-LABEL: @trunc_shl_7_i8_i64( -; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 %val to i8 +; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i8 ; CHECK-NEXT: [[TRUNC:%.*]] = shl i8 [[VAL_TR]], 7 ; CHECK-NEXT: ret i8 [[TRUNC]] ; @@ -318,7 +320,7 @@ define i8 @trunc_shl_7_i8_i64(i64 %val) { define i2 @trunc_shl_1_i2_i64(i64 %val) { ; CHECK-LABEL: @trunc_shl_1_i2_i64( -; CHECK-NEXT: [[SHL:%.*]] = shl i64 %val, 1 +; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[VAL:%.*]], 1 ; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHL]] to i2 ; CHECK-NEXT: ret i2 [[TRUNC]] ; @@ -329,7 +331,7 @@ define i2 @trunc_shl_1_i2_i64(i64 %val) { define i32 @trunc_shl_1_i32_i64(i64 %val) { ; CHECK-LABEL: @trunc_shl_1_i32_i64( -; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 %val to i32 +; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32 ; CHECK-NEXT: [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 1 ; CHECK-NEXT: ret i32 [[TRUNC]] ; @@ -340,7 +342,7 @@ define i32 @trunc_shl_1_i32_i64(i64 %val) { define i32 @trunc_shl_16_i32_i64(i64 %val) { ; CHECK-LABEL: @trunc_shl_16_i32_i64( -; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 %val to i32 +; CHECK-NEXT: [[VAL_TR:%.*]] = trunc i64 [[VAL:%.*]] to i32 ; CHECK-NEXT: [[TRUNC:%.*]] = shl i32 [[VAL_TR]], 16 ; CHECK-NEXT: ret i32 [[TRUNC]] ; @@ -370,7 +372,7 @@ define i32 @trunc_shl_32_i32_i64(i64 %val) { ; TODO: Should be able to handle vectors define <2 x i32> @trunc_shl_16_v2i32_v2i64(<2 x i64> %val) { ; CHECK-LABEL: @trunc_shl_16_v2i32_v2i64( -; CHECK-NEXT: [[SHL:%.*]] = 
shl <2 x i64> %val, +; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i64> [[VAL:%.*]], ; CHECK-NEXT: [[TRUNC:%.*]] = trunc <2 x i64> [[SHL]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[TRUNC]] ; @@ -381,7 +383,7 @@ define <2 x i32> @trunc_shl_16_v2i32_v2i64(<2 x i64> %val) { define <2 x i32> @trunc_shl_nosplat_v2i32_v2i64(<2 x i64> %val) { ; CHECK-LABEL: @trunc_shl_nosplat_v2i32_v2i64( -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i64> %val, +; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i64> [[VAL:%.*]], ; CHECK-NEXT: [[TRUNC:%.*]] = trunc <2 x i64> [[SHL]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[TRUNC]] ; @@ -392,10 +394,10 @@ define <2 x i32> @trunc_shl_nosplat_v2i32_v2i64(<2 x i64> %val) { define void @trunc_shl_31_i32_i64_multi_use(i64 %val, i32 addrspace(1)* %ptr0, i64 addrspace(1)* %ptr1) { ; CHECK-LABEL: @trunc_shl_31_i32_i64_multi_use( -; CHECK-NEXT: [[SHL:%.*]] = shl i64 %val, 31 +; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[VAL:%.*]], 31 ; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[SHL]] to i32 -; CHECK-NEXT: store volatile i32 [[TRUNC]], i32 addrspace(1)* %ptr0, align 4 -; CHECK-NEXT: store volatile i64 [[SHL]], i64 addrspace(1)* %ptr1, align 8 +; CHECK-NEXT: store volatile i32 [[TRUNC]], i32 addrspace(1)* [[PTR0:%.*]], align 4 +; CHECK-NEXT: store volatile i64 [[SHL]], i64 addrspace(1)* [[PTR1:%.*]], align 8 ; CHECK-NEXT: ret void ; %shl = shl i64 %val, 31 @@ -407,7 +409,7 @@ define void @trunc_shl_31_i32_i64_multi_use(i64 %val, i32 addrspace(1)* %ptr0, i define i32 @trunc_shl_lshr_infloop(i64 %arg) { ; CHECK-LABEL: @trunc_shl_lshr_infloop( -; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 %arg, 1 +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ARG:%.*]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[TMP2]] @@ -420,7 +422,7 @@ define i32 @trunc_shl_lshr_infloop(i64 %arg) { define i32 @trunc_shl_ashr_infloop(i64 %arg) { ; CHECK-LABEL: @trunc_shl_ashr_infloop( -; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 %arg, 3 +; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 [[ARG:%.*]], 3 ; CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[TMP2]] @@ -433,7 +435,7 @@ define i32 @trunc_shl_ashr_infloop(i64 %arg) { define i32 @trunc_shl_shl_infloop(i64 %arg) { ; CHECK-LABEL: @trunc_shl_shl_infloop( -; CHECK-NEXT: [[ARG_TR:%.*]] = trunc i64 %arg to i32 +; CHECK-NEXT: [[ARG_TR:%.*]] = trunc i64 [[ARG:%.*]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[ARG_TR]], 3 ; CHECK-NEXT: ret i32 [[TMP2]] ; @@ -445,7 +447,7 @@ define i32 @trunc_shl_shl_infloop(i64 %arg) { define i32 @trunc_shl_lshr_var(i64 %arg, i64 %val) { ; CHECK-LABEL: @trunc_shl_lshr_var( -; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 %arg, %val +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ARG:%.*]], [[VAL:%.*]] ; CHECK-NEXT: [[TMP0_TR:%.*]] = trunc i64 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0_TR]], 2 ; CHECK-NEXT: ret i32 [[TMP2]] @@ -458,7 +460,7 @@ define i32 @trunc_shl_lshr_var(i64 %arg, i64 %val) { define i32 @trunc_shl_ashr_var(i64 %arg, i64 %val) { ; CHECK-LABEL: @trunc_shl_ashr_var( -; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 %arg, %val +; CHECK-NEXT: [[TMP0:%.*]] = ashr i64 [[ARG:%.*]], [[VAL:%.*]] ; CHECK-NEXT: [[TMP0_TR:%.*]] = trunc i64 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0_TR]], 2 ; CHECK-NEXT: ret i32 [[TMP2]] @@ -471,7 +473,7 @@ define i32 @trunc_shl_ashr_var(i64 %arg, i64 %val) { define i32 @trunc_shl_shl_var(i64 %arg, i64 %val) { ; CHECK-LABEL: @trunc_shl_shl_var( -; CHECK-NEXT: [[TMP0:%.*]] = shl i64 
%arg, %val +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[ARG:%.*]], [[VAL:%.*]] ; CHECK-NEXT: [[TMP0_TR:%.*]] = trunc i64 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0_TR]], 2 ; CHECK-NEXT: ret i32 [[TMP2]] @@ -484,7 +486,7 @@ define i32 @trunc_shl_shl_var(i64 %arg, i64 %val) { define <8 x i16> @trunc_shl_v8i15_v8i32_15(<8 x i32> %a) { ; CHECK-LABEL: @trunc_shl_v8i15_v8i32_15( -; CHECK-NEXT: [[SHL:%.*]] = shl <8 x i32> %a, +; CHECK-NEXT: [[SHL:%.*]] = shl <8 x i32> [[A:%.*]], ; CHECK-NEXT: [[CONV:%.*]] = trunc <8 x i32> [[SHL]] to <8 x i16> ; CHECK-NEXT: ret <8 x i16> [[CONV]] ; @@ -513,7 +515,7 @@ define <8 x i16> @trunc_shl_v8i16_v8i32_17(<8 x i32> %a) { define <8 x i16> @trunc_shl_v8i16_v8i32_4(<8 x i32> %a) { ; CHECK-LABEL: @trunc_shl_v8i16_v8i32_4( -; CHECK-NEXT: [[SHL:%.*]] = shl <8 x i32> %a, +; CHECK-NEXT: [[SHL:%.*]] = shl <8 x i32> [[A:%.*]], ; CHECK-NEXT: [[CONV:%.*]] = trunc <8 x i32> [[SHL]] to <8 x i16> ; CHECK-NEXT: ret <8 x i16> [[CONV]] ; @@ -527,7 +529,7 @@ define <8 x i16> @trunc_shl_v8i16_v8i32_4(<8 x i32> %a) { define <4 x i8> @wide_shuf(<4 x i32> %x) { ; CHECK-LABEL: @wide_shuf( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> %x, <4 x i32> , <4 x i32> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> , <4 x i32> ; CHECK-NEXT: [[TRUNC:%.*]] = trunc <4 x i32> [[SHUF]] to <4 x i8> ; CHECK-NEXT: ret <4 x i8> [[TRUNC]] ; @@ -540,7 +542,7 @@ define <4 x i8> @wide_shuf(<4 x i32> %x) { define <4 x i8> @wide_splat1(<4 x i32> %x) { ; CHECK-LABEL: @wide_splat1( -; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> %x to <4 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[X:%.*]] to <4 x i8> ; CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i8> [[TRUNC]] ; @@ -554,7 +556,7 @@ define <4 x i8> @wide_splat1(<4 x i32> %x) { define <3 x i31> @wide_splat2(<3 x i33> %x) { ; CHECK-LABEL: @wide_splat2( -; CHECK-NEXT: [[TMP1:%.*]] = trunc <3 x i33> %x to <3 x i31> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <3 x i33> [[X:%.*]] to <3 x i31> ; CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <3 x i31> [[TMP1]], <3 x i31> undef, <3 x i32> ; CHECK-NEXT: ret <3 x i31> [[TRUNC]] ; @@ -569,7 +571,7 @@ define <3 x i31> @wide_splat2(<3 x i33> %x) { define <3 x i31> @wide_splat3(<3 x i33> %x) { ; CHECK-LABEL: @wide_splat3( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x i33> [[X:%.*]], <3 x i33> undef, <3 x i32> ; CHECK-NEXT: [[TRUNC:%.*]] = trunc <3 x i33> [[SHUF]] to <3 x i31> ; CHECK-NEXT: ret <3 x i31> [[TRUNC]] ; @@ -582,7 +584,7 @@ define <3 x i31> @wide_splat3(<3 x i33> %x) { define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) { ; CHECK-LABEL: @wide_lengthening_splat( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TR:%.*]] = trunc <8 x i16> [[SHUF]] to <8 x i8> ; CHECK-NEXT: ret <8 x i8> [[TR]] ; @@ -593,7 +595,7 @@ define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) { define <2 x i8> @narrow_add_vec_constant(<2 x i32> %x) { ; CHECK-LABEL: @narrow_add_vec_constant( -; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> %x to <2 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8> ; CHECK-NEXT: [[TR:%.*]] = add <2 x i8> [[TMP1]], ; CHECK-NEXT: ret <2 x i8> [[TR]] ; @@ -604,7 +606,7 @@ define <2 x i8> @narrow_add_vec_constant(<2 x i32> 
%x) { define <2 x i8> @narrow_mul_vec_constant(<2 x i32> %x) { ; CHECK-LABEL: @narrow_mul_vec_constant( -; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> %x to <2 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8> ; CHECK-NEXT: [[TR:%.*]] = mul <2 x i8> [[TMP1]], ; CHECK-NEXT: ret <2 x i8> [[TR]] ; @@ -615,7 +617,7 @@ define <2 x i8> @narrow_mul_vec_constant(<2 x i32> %x) { define <2 x i8> @narrow_sub_vec_constant(<2 x i32> %x) { ; CHECK-LABEL: @narrow_sub_vec_constant( -; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> %x to <2 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8> ; CHECK-NEXT: [[TR:%.*]] = sub <2 x i8> , [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[TR]] ; From d704922315b712322a0871a4b613b560069a6c12 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Wed, 15 May 2019 12:01:04 +0000 Subject: [PATCH 04/19] arm64_32: add some unittests that were in the wrong commit. Accidentally dropped them when committing the arm64_32 binutils support. There's no change to real code. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360763 91177308-0d34-0410-b5e6-96231b3b80d8 --- unittests/ADT/TripleTest.cpp | 11 +++++++++++ unittests/Support/TargetParserTest.cpp | 3 +++ 2 files changed, 14 insertions(+) diff --git a/unittests/ADT/TripleTest.cpp b/unittests/ADT/TripleTest.cpp index 9c8f3b1d43a9..b0f13d84af4e 100644 --- a/unittests/ADT/TripleTest.cpp +++ b/unittests/ADT/TripleTest.cpp @@ -552,6 +552,13 @@ TEST(TripleTest, ParsedIDs) { EXPECT_EQ(Triple::OpenEmbedded, T.getVendor()); EXPECT_EQ(Triple::Linux, T.getOS()); EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + EXPECT_TRUE(T.isArch64Bit()); + + T = Triple("arm64_32-apple-ios"); + EXPECT_EQ(Triple::aarch64_32, T.getArch()); + EXPECT_EQ(Triple::IOS, T.getOS()); + EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + EXPECT_TRUE(T.isArch32Bit()); T = Triple("huh"); EXPECT_EQ(Triple::UnknownArch, T.getArch()); @@ -1457,6 +1464,10 @@ TEST(TripleTest, ParseARMArch) { Triple T = Triple("arm64"); EXPECT_EQ(Triple::aarch64, T.getArch()); } + { + Triple T = Triple("arm64_32"); + EXPECT_EQ(Triple::aarch64_32, T.getArch()); + } { Triple T = Triple("aarch64"); EXPECT_EQ(Triple::aarch64, T.getArch()); diff --git a/unittests/Support/TargetParserTest.cpp b/unittests/Support/TargetParserTest.cpp index a973d980d265..08ed7645ea0e 100644 --- a/unittests/Support/TargetParserTest.cpp +++ b/unittests/Support/TargetParserTest.cpp @@ -658,12 +658,15 @@ TEST(TargetParserTest, ARMparseArchEndianAndISA) { } EXPECT_EQ(ARM::EndianKind::LITTLE, ARM::parseArchEndian("aarch64")); + EXPECT_EQ(ARM::EndianKind::LITTLE, ARM::parseArchEndian("arm64_32")); EXPECT_EQ(ARM::EndianKind::BIG, ARM::parseArchEndian("aarch64_be")); EXPECT_EQ(ARM::ISAKind::AARCH64, ARM::parseArchISA("aarch64")); EXPECT_EQ(ARM::ISAKind::AARCH64, ARM::parseArchISA("aarch64_be")); EXPECT_EQ(ARM::ISAKind::AARCH64, ARM::parseArchISA("arm64")); EXPECT_EQ(ARM::ISAKind::AARCH64, ARM::parseArchISA("arm64_be")); + EXPECT_EQ(ARM::ISAKind::AARCH64, ARM::parseArchISA("arm64_32")); + EXPECT_EQ(ARM::ISAKind::AARCH64, ARM::parseArchISA("aarch64_32")); } TEST(TargetParserTest, ARMparseArchProfile) { From 7e23545443eb8bb2acb0c40d3188976448ec2e12 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 15 May 2019 12:03:10 +0000 Subject: [PATCH 05/19] gn build: Run `git ls-files '*.gn' '*.gni' | xargs llvm/utils/gn/gn.py format` git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360764 91177308-0d34-0410-b5e6-96231b3b80d8 --- 
utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn | 8 ++++---- utils/gn/secondary/llvm/lib/Target/Lanai/BUILD.gn | 4 ++-- utils/gn/secondary/llvm/lib/Target/Sparc/BUILD.gn | 6 +++--- .../secondary/llvm/lib/Target/Sparc/MCTargetDesc/BUILD.gn | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn index 7c6d28025a80..1cfee43e7005 100644 --- a/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn +++ b/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn @@ -60,15 +60,15 @@ static_library("LLVMHexagonCodeGen") { "HexagonGenPredicate.cpp", "HexagonHardwareLoops.cpp", "HexagonHazardRecognizer.cpp", - "HexagonInstrInfo.cpp", "HexagonISelDAGToDAG.cpp", "HexagonISelDAGToDAGHVX.cpp", "HexagonISelLowering.cpp", "HexagonISelLoweringHVX.cpp", + "HexagonInstrInfo.cpp", "HexagonLoopIdiomRecognition.cpp", + "HexagonMCInstLower.cpp", "HexagonMachineFunctionInfo.cpp", "HexagonMachineScheduler.cpp", - "HexagonMCInstLower.cpp", "HexagonNewValueJump.cpp", "HexagonOptAddrMode.cpp", "HexagonOptimizeSZextends.cpp", @@ -83,10 +83,10 @@ static_library("LLVMHexagonCodeGen") { "HexagonTargetMachine.cpp", "HexagonTargetObjectFile.cpp", "HexagonTargetTransformInfo.cpp", - "HexagonVectorLoopCarriedReuse.cpp", - "HexagonVectorPrint.cpp", "HexagonVExtract.cpp", "HexagonVLIWPacketizer.cpp", + "HexagonVectorLoopCarriedReuse.cpp", + "HexagonVectorPrint.cpp", "RDFCopy.cpp", "RDFDeadCode.cpp", "RDFGraph.cpp", diff --git a/utils/gn/secondary/llvm/lib/Target/Lanai/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/Lanai/BUILD.gn index 6600769cb745..85903af80f6a 100644 --- a/utils/gn/secondary/llvm/lib/Target/Lanai/BUILD.gn +++ b/utils/gn/secondary/llvm/lib/Target/Lanai/BUILD.gn @@ -33,11 +33,11 @@ static_library("LLVMLanaiCodeGen") { "LanaiAsmPrinter.cpp", "LanaiDelaySlotFiller.cpp", "LanaiFrameLowering.cpp", - "LanaiInstrInfo.cpp", "LanaiISelDAGToDAG.cpp", "LanaiISelLowering.cpp", - "LanaiMachineFunctionInfo.cpp", + "LanaiInstrInfo.cpp", "LanaiMCInstLower.cpp", + "LanaiMachineFunctionInfo.cpp", "LanaiMemAluCombiner.cpp", "LanaiRegisterInfo.cpp", "LanaiSelectionDAGInfo.cpp", diff --git a/utils/gn/secondary/llvm/lib/Target/Sparc/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/Sparc/BUILD.gn index de51b1006b33..89242e0bae26 100644 --- a/utils/gn/secondary/llvm/lib/Target/Sparc/BUILD.gn +++ b/utils/gn/secondary/llvm/lib/Target/Sparc/BUILD.gn @@ -32,15 +32,15 @@ static_library("LLVMSparcCodeGen") { "DelaySlotFiller.cpp", "LeonPasses.cpp", "SparcAsmPrinter.cpp", - "SparcInstrInfo.cpp", + "SparcFrameLowering.cpp", "SparcISelDAGToDAG.cpp", "SparcISelLowering.cpp", - "SparcFrameLowering.cpp", + "SparcInstrInfo.cpp", + "SparcMCInstLower.cpp", "SparcMachineFunctionInfo.cpp", "SparcRegisterInfo.cpp", "SparcSubtarget.cpp", "SparcTargetMachine.cpp", - "SparcMCInstLower.cpp", "SparcTargetObjectFile.cpp", ] } diff --git a/utils/gn/secondary/llvm/lib/Target/Sparc/MCTargetDesc/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/Sparc/MCTargetDesc/BUILD.gn index 27e5a778a6b5..15e9057fd0c1 100644 --- a/utils/gn/secondary/llvm/lib/Target/Sparc/MCTargetDesc/BUILD.gn +++ b/utils/gn/secondary/llvm/lib/Target/Sparc/MCTargetDesc/BUILD.gn @@ -63,8 +63,8 @@ static_library("MCTargetDesc") { "SparcInstPrinter.cpp", "SparcMCAsmInfo.cpp", "SparcMCCodeEmitter.cpp", - "SparcMCTargetDesc.cpp", "SparcMCExpr.cpp", + "SparcMCTargetDesc.cpp", "SparcTargetStreamer.cpp", ] } From 6e7af205f8e0d5ecd134ee4d624816287e9623a1 Mon Sep 17 00:00:00 2001 From: Simon 
Atanasyan
Date: Wed, 15 May 2019 12:05:27 +0000
Subject: [PATCH 06/19] [mips] LLVM and GAS now use the same instruction for
 CFA definition. NFCI

LLVM previously used the `DW_CFA_def_cfa` instruction in .eh_frame to set
the register and offset for the current CFA rule. We change it to
`DW_CFA_def_cfa_register`, the instruction GAS uses, which changes only
the register while keeping the old offset.

Patch by Mirko Brkusanin.

Differential Revision: https://reviews.llvm.org/D61899

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360765 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 2 +-
 test/MC/Mips/cfi-advance-loc.s                    | 4 ++--
 test/MC/Mips/cfi-encoding.s                       | 6 +++---
 test/MC/Mips/eh-frame.s                           | 6 ++----
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index ac49a0eeb730..ddeec03ba784 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -85,7 +85,7 @@ static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI,
   MCAsmInfo *MAI = new MipsMCAsmInfo(TT);

   unsigned SP = MRI.getDwarfRegNum(Mips::SP, true);
-  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfaRegister(nullptr, SP);
   MAI->addInitialFrameState(Inst);

   return MAI;
diff --git a/test/MC/Mips/cfi-advance-loc.s b/test/MC/Mips/cfi-advance-loc.s
index 4f5666ab33cd..407ad3faffe3 100644
--- a/test/MC/Mips/cfi-advance-loc.s
+++ b/test/MC/Mips/cfi-advance-loc.s
@@ -37,7 +37,7 @@ g:
 // CHECK-LE-NEXT: EntrySize: 0
 // CHECK-LE-NEXT: SectionData (
 // CHECK-LE-NEXT: 0000: 10000000 00000000 017A5200 017C1F01
-// CHECK-LE-NEXT: 0010: 0B0C1D00 14000000 18000000 00000000
+// CHECK-LE-NEXT: 0010: 0B0D1D00 14000000 18000000 00000000
 // CHECK-LE-NEXT: 0020: 04010000 00030001 0E080000 14000000
 // CHECK-LE-NEXT: 0030: 30000000 04010000 04000100 00040000
 // CHECK-LE-NEXT: 0040: 01000E08
@@ -60,7 +60,7 @@ g:
 // CHECK-BE-NEXT: EntrySize: 0
 // CHECK-BE-NEXT: SectionData (
 // CHECK-BE-NEXT: 0000: 00000010 00000000 017A5200 017C1F01
-// CHECK-BE-NEXT: 0010: 0B0C1D00 00000014 00000018 00000000
+// CHECK-BE-NEXT: 0010: 0B0D1D00 00000014 00000018 00000000
 // CHECK-BE-NEXT: 0020: 00000104 00030100 0E080000 00000014
 // CHECK-BE-NEXT: 0030: 00000030 00000104 00010004 00040001
 // CHECK-BE-NEXT: 0040: 00000E08
diff --git a/test/MC/Mips/cfi-encoding.s b/test/MC/Mips/cfi-encoding.s
index fe0980768307..98a5ad1afe83 100644
--- a/test/MC/Mips/cfi-encoding.s
+++ b/test/MC/Mips/cfi-encoding.s
@@ -6,15 +6,15 @@
 # RUN:   | llvm-objdump -s -section=.eh_frame - | FileCheck --check-prefix=N64 %s

 # O32: 0000 00000010 00000000 017a5200 017c1f01
-# O32: 0010 0b0c1d00 00000010 00000018 00000000
+# O32: 0010 0b0d1d00 00000010 00000018 00000000
 # O32: 0020 00000004 00000000

 # N32: 0000 00000010 00000000 017a5200 017c1f01
-# N32: 0010 0b0c1d00 00000010 00000018 00000000
+# N32: 0010 0b0d1d00 00000010 00000018 00000000
 # N32: 0020 00000004 00000000

 # N64: 0000 00000010 00000000 017a5200 01781f01
-# N64: 0010 0c0c1d00 00000018 00000018 00000000
+# N64: 0010 0c0d1d00 00000018 00000018 00000000
 # N64: 0020 00000000 00000000 00000004 00000000

 foo:
diff --git a/test/MC/Mips/eh-frame.s b/test/MC/Mips/eh-frame.s
index e03027a20a4f..e901f44196d8 100644
--- a/test/MC/Mips/eh-frame.s
+++ b/test/MC/Mips/eh-frame.s
@@ -31,8 +31,7 @@ func:
 // DWARF32: Return address column: 31
 // DWARF32: Augmentation data: 0B
 //            ^^ fde pointer encoding: DW_EH_PE_sdata4
-// DWARF32: DW_CFA_def_cfa: reg29 +0
-// FIXME: The instructions are different from the ones produces by gas.
+// DWARF32: DW_CFA_def_cfa_register: reg29
 //
 // DWARF32: 00000014 00000010 00000018 FDE cie=00000018 pc=00000000...00000000
 // DWARF32: DW_CFA_nop:
@@ -49,8 +48,7 @@ func:
 // DWARF64: Return address column: 31
 // DWARF64: Augmentation data: 0C
 //            ^^ fde pointer encoding: DW_EH_PE_sdata8
-// DWARF64: DW_CFA_def_cfa: reg29 +0
-// FIXME: The instructions are different from the ones produces by gas.
+// DWARF64: DW_CFA_def_cfa_register: reg29
 //
 // DWARF64: 00000014 00000018 00000018 FDE cie=00000018 pc=00000000...00000000
 // DWARF64: DW_CFA_nop:

From 5a5e3cc42ff547081062174ca80e1908064f61eb Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Wed, 15 May 2019 12:08:45 +0000
Subject: [PATCH 07/19] gn build: Merge r360671

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360766 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn b/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
index fb9b2ab82d05..db550dbe790e 100644
--- a/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
+++ b/utils/gn/secondary/llvm/unittests/Analysis/BUILD.gn
@@ -20,6 +20,7 @@ unittest("AnalysisTests") {
     "DivergenceAnalysisTest.cpp",
     "DomTreeUpdaterTest.cpp",
     "GlobalsModRefTest.cpp",
+    "IVDescriptorsTest.cpp",
     "LazyCallGraphTest.cpp",
     "LoopInfoTest.cpp",
     "MemoryBuiltinsTest.cpp",

From 14e3c5404092f7ec9775c16082f02617bcf02029 Mon Sep 17 00:00:00 2001
From: David Green
Date: Wed, 15 May 2019 12:41:58 +0000
Subject: [PATCH 08/19] [ARM] Cortex-M4 schedule

This patch adds a simple Cortex-M4 schedule, renaming the existing M3
schedule to M4 and filling in the latencies as per the Cortex-M4 TRM:
https://developer.arm.com/docs/ddi0439/latest

Most of these are 1, with the important exception being loads taking 2
cycles. A few others are also higher, but I don't believe they make a
large difference.

I've repurposed the M3 schedule as the latencies are mostly the same
between the two cores, with the M4 having more FP and DSP instructions.
We also turn on MISched and UseAA for the cores that now use this.

It also adds some schedule Writes to various instructions to make things
simpler.
Differential Revision: https://reviews.llvm.org/D54142 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360768 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARM.td | 20 ++- lib/Target/ARM/ARMInstrThumb.td | 28 ++--- lib/Target/ARM/ARMInstrThumb2.td | 52 ++++---- lib/Target/ARM/ARMSchedule.td | 2 +- lib/Target/ARM/ARMScheduleM3.td | 20 --- lib/Target/ARM/ARMScheduleM4.td | 119 ++++++++++++++++++ .../ARM/ParallelDSP/multi-use-loads.ll | 30 ++--- test/CodeGen/ARM/aapcs-hfa-code.ll | 4 +- test/CodeGen/ARM/useaa.ll | 2 + .../Thumb2/ifcvt-no-branch-predictor.ll | 6 +- test/CodeGen/Thumb2/m4-sched-ldr.mir | 60 +++++++++ test/CodeGen/Thumb2/m4-sched-regs.ll | 52 ++++++++ 12 files changed, 310 insertions(+), 85 deletions(-) delete mode 100644 lib/Target/ARM/ARMScheduleM3.td create mode 100644 lib/Target/ARM/ARMScheduleM4.td create mode 100644 test/CodeGen/Thumb2/m4-sched-ldr.mir create mode 100644 test/CodeGen/Thumb2/m4-sched-regs.ll diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 252b98d1995f..48eba2246c57 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -978,21 +978,27 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m, +def : ProcessorModel<"cortex-m3", CortexM4Model, [ARMv7m, ProcM3, FeaturePrefLoopAlign32, + FeatureUseMISched, + FeatureUseAA, FeatureHasNoBranchPredictor]>; -def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m, +def : ProcessorModel<"sc300", CortexM4Model, [ARMv7m, ProcM3, + FeatureUseMISched, + FeatureUseAA, FeatureHasNoBranchPredictor]>; -def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em, +def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em, FeatureVFP4, FeatureVFPOnlySP, FeatureD16, FeaturePrefLoopAlign32, FeatureHasSlowFPVMLx, + FeatureUseMISched, + FeatureUseAA, FeatureHasNoBranchPredictor]>; def : ProcNoItin<"cortex-m7", [ARMv7em, @@ -1002,22 +1008,26 @@ def : ProcNoItin<"cortex-m7", [ARMv7em, def : ProcNoItin<"cortex-m23", [ARMv8mBaseline, FeatureNoMovt]>; -def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline, +def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline, FeatureDSP, FeatureFPARMv8, FeatureD16, FeatureVFPOnlySP, FeaturePrefLoopAlign32, FeatureHasSlowFPVMLx, + FeatureUseMISched, + FeatureUseAA, FeatureHasNoBranchPredictor]>; -def : ProcessorModel<"cortex-m35p", CortexM3Model, [ARMv8mMainline, +def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline, FeatureDSP, FeatureFPARMv8, FeatureD16, FeatureVFPOnlySP, FeaturePrefLoopAlign32, FeatureHasSlowFPVMLx, + FeatureUseMISched, + FeatureUseAA, FeatureHasNoBranchPredictor]>; diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 8337aaadb9b1..cfeb13c6acb6 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -663,7 +663,7 @@ let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, "ldr", "\t$Rt, $addr", [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, - T1Encoding<{0,1,0,0,1,?}> { + T1Encoding<{0,1,0,0,1,?}>, Sched<[WriteLd]> { // A6.2 & A8.6.59 bits<3> Rt; bits<8> addr; @@ -677,7 +677,7 @@ let canFoldAsLoad = 1 in def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, "ldr", "\t$Rt, $addr", [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, - T1LdStSP<{1,?,?}> { + T1LdStSP<{1,?,?}>, Sched<[WriteLd]> { bits<3> Rt; bits<8> addr; 
let Inst{10-8} = Rt; @@ -728,39 +728,39 @@ multiclass thumb_st_rr_ri_enc reg_opc, bits<4> imm_opc, defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr, t_addrmode_is4, AddrModeT1_4, IIC_iLoad_r, IIC_iLoad_i, "ldr", - load>; + load>, Sched<[WriteLd]>; // A8.6.64 & A8.6.61 defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr, t_addrmode_is1, AddrModeT1_1, IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb", - zextloadi8>; + zextloadi8>, Sched<[WriteLd]>; // A8.6.76 & A8.6.73 defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr, t_addrmode_is2, AddrModeT1_2, IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh", - zextloadi16>; + zextloadi16>, Sched<[WriteLd]>; let AddedComplexity = 10 in def tLDRSB : // A8.6.80 T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr_sext:$addr), AddrModeT1_1, IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr", - [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr_sext:$addr))]>; + [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr_sext:$addr))]>, Sched<[WriteLd]>; let AddedComplexity = 10 in def tLDRSH : // A8.6.84 T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr_sext:$addr), AddrModeT1_2, IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr", - [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr_sext:$addr))]>; + [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr_sext:$addr))]>, Sched<[WriteLd]>; def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, "str", "\t$Rt, $addr", [(store tGPR:$Rt, t_addrmode_sp:$addr)]>, - T1LdStSP<{0,?,?}> { + T1LdStSP<{0,?,?}>, Sched<[WriteST]> { bits<3> Rt; bits<8> addr; let Inst{10-8} = Rt; @@ -771,19 +771,19 @@ def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr, t_addrmode_is4, AddrModeT1_4, IIC_iStore_r, IIC_iStore_i, "str", - store>; + store>, Sched<[WriteST]>; // A8.6.197 & A8.6.195 defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr, t_addrmode_is1, AddrModeT1_1, IIC_iStore_bh_r, IIC_iStore_bh_i, "strb", - truncstorei8>; + truncstorei8>, Sched<[WriteST]>; // A8.6.207 & A8.6.205 defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr, t_addrmode_is2, AddrModeT1_2, IIC_iStore_bh_r, IIC_iStore_bh_i, "strh", - truncstorei16>; + truncstorei16>, Sched<[WriteST]>; //===----------------------------------------------------------------------===// @@ -843,7 +843,7 @@ let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1, def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), IIC_iPop, "pop${p}\t$regs", []>, - T1Misc<{1,1,0,?,?,?,?}> { + T1Misc<{1,1,0,?,?,?,?}>, Sched<[WriteLd]> { bits<16> regs; let Inst{8} = regs{15}; let Inst{7-0} = regs{7-0}; @@ -853,7 +853,7 @@ let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops), IIC_iStore_m, "push${p}\t$regs", []>, - T1Misc<{0,1,0,?,?,?,?}> { + T1Misc<{0,1,0,?,?,?,?}>, Sched<[WriteST]> { bits<16> regs; let Inst{8} = regs{14}; let Inst{7-0} = regs{7-0}; @@ -1214,7 +1214,7 @@ def tMUL : // A8.6.105 T1 Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2, IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd", [(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>, - T1DataProcessing<0b1101> { + T1DataProcessing<0b1101>, Sched<[WriteMUL32, ReadMUL, ReadMUL]> { bits<3> Rd; bits<3> Rn; let Inst{5-3} = Rn; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 2909d03cca0e..234b2767494d 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -1333,7 
+1333,8 @@ def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb), (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset), AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu, - "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>; + "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>, + Sched<[WriteLd]>; def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb), (ins t2addrmode_imm8_pre:$addr), @@ -2331,14 +2332,14 @@ class T2SatI def t2SSAT: T2SatI<(ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), "ssat", "\t$Rd, $sat_imm, $Rn$sh">, - Requires<[IsThumb2]> { + Requires<[IsThumb2]>, Sched<[WriteALU]> { let Inst{23-22} = 0b00; let Inst{5} = 0; } def t2SSAT16: T2SatI<(ins imm1_16:$sat_imm, rGPR:$Rn), "ssat16", "\t$Rd, $sat_imm, $Rn">, - Requires<[IsThumb2, HasDSP]> { + Requires<[IsThumb2, HasDSP]>, Sched<[WriteALU]> { let Inst{23-22} = 0b00; let sh = 0b100000; let Inst{4} = 0; @@ -2346,13 +2347,13 @@ def t2SSAT16: T2SatI<(ins imm1_16:$sat_imm, rGPR:$Rn), def t2USAT: T2SatI<(ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh), "usat", "\t$Rd, $sat_imm, $Rn$sh">, - Requires<[IsThumb2]> { + Requires<[IsThumb2]>, Sched<[WriteALU]> { let Inst{23-22} = 0b10; } def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn), "usat16", "\t$Rd, $sat_imm, $Rn">, - Requires<[IsThumb2, HasDSP]> { + Requires<[IsThumb2, HasDSP]>, Sched<[WriteALU]> { let Inst{23-22} = 0b10; let sh = 0b100000; let Inst{4} = 0; @@ -2476,7 +2477,7 @@ class T2TwoRegBitFI { + [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]>, Sched<[WriteALU]> { let Inst{31-27} = 0b11110; let Inst{26} = 0; // should be 0. let Inst{25} = 1; @@ -2492,7 +2493,7 @@ def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm), def t2SBFX: T2TwoRegBitFI< (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb), - IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> { + IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []>, Sched<[WriteALU]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-20} = 0b10100; @@ -2501,7 +2502,7 @@ def t2SBFX: T2TwoRegBitFI< def t2UBFX: T2TwoRegBitFI< (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb), - IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> { + IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []>, Sched<[WriteALU]> { let Inst{31-27} = 0b11110; let Inst{25} = 1; let Inst{24-20} = 0b11100; @@ -2527,7 +2528,7 @@ let Constraints = "$src = $Rd" in { (ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm), IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm", [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn, - bf_inv_mask_imm:$imm))]> { + bf_inv_mask_imm:$imm))]>, Sched<[WriteALU]> { let Inst{31-27} = 0b11110; let Inst{26} = 0; // should be 0. 
let Inst{25} = 1; @@ -3281,17 +3282,17 @@ def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "ldrexb", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]>; + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteLd]>; def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "ldrexh", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]>; + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteLd]>; def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr), AddrModeT2_ldrex, 4, NoItinerary, "ldrex", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]> { + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteLd]> { bits<4> Rt; bits<12> addr; let Inst{31-27} = 0b11101; @@ -3307,7 +3308,7 @@ def t2LDREXD : T2I_ldrex<0b0111, (outs rGPR:$Rt, rGPR:$Rt2), AddrModeNone, 4, NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}>, - Requires<[IsThumb2, IsNotMClass]> { + Requires<[IsThumb2, IsNotMClass]>, Sched<[WriteLd]> { bits<4> Rt2; let Inst{11-8} = Rt2; } @@ -3315,17 +3316,17 @@ def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "ldaexb", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldaex_1 addr_offset_none:$addr))]>, - Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>; + Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, Sched<[WriteLd]>; def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "ldaexh", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldaex_2 addr_offset_none:$addr))]>, - Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>; + Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, Sched<[WriteLd]>; def t2LDAEX : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "ldaex", "\t$Rt, $addr", "", [(set rGPR:$Rt, (ldaex_4 addr_offset_none:$addr))]>, - Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> { + Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, Sched<[WriteLd]> { bits<4> Rt; bits<4> addr; let Inst{31-27} = 0b11101; @@ -3341,7 +3342,7 @@ def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2), AddrModeNone, 4, NoItinerary, "ldaexd", "\t$Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}>, Requires<[IsThumb, - HasAcquireRelease, HasV7Clrex, IsNotMClass]> { + HasAcquireRelease, HasV7Clrex, IsNotMClass]>, Sched<[WriteLd]> { bits<4> Rt2; let Inst{11-8} = Rt2; @@ -3356,14 +3357,14 @@ def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd), "strexb", "\t$Rd, $Rt, $addr", "", [(set rGPR:$Rd, (strex_1 rGPR:$Rt, addr_offset_none:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]>; + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteST]>; def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd), (ins rGPR:$Rt, addr_offset_none:$addr), AddrModeNone, 4, NoItinerary, "strexh", "\t$Rd, $Rt, $addr", "", [(set rGPR:$Rd, (strex_2 rGPR:$Rt, addr_offset_none:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]>; + Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteST]>; def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_imm0_1020s4:$addr), @@ -3371,7 +3372,7 @@ def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, "strex", "\t$Rd, $Rt, $addr", "", [(set rGPR:$Rd, (strex_4 rGPR:$Rt, t2addrmode_imm0_1020s4:$addr))]>, - Requires<[IsThumb, HasV8MBaseline]> { + Requires<[IsThumb, HasV8MBaseline]>, 
Sched<[WriteST]> { bits<4> Rd; bits<4> Rt; bits<12> addr; @@ -3388,7 +3389,7 @@ def t2STREXD : T2I_strex<0b0111, (outs rGPR:$Rd), AddrModeNone, 4, NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}>, - Requires<[IsThumb2, IsNotMClass]> { + Requires<[IsThumb2, IsNotMClass]>, Sched<[WriteST]> { bits<4> Rt2; let Inst{11-8} = Rt2; } @@ -3399,7 +3400,7 @@ def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd), [(set rGPR:$Rd, (stlex_1 rGPR:$Rt, addr_offset_none:$addr))]>, Requires<[IsThumb, HasAcquireRelease, - HasV7Clrex]>; + HasV7Clrex]>, Sched<[WriteST]>; def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd), (ins rGPR:$Rt, addr_offset_none:$addr), @@ -3408,7 +3409,7 @@ def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd), [(set rGPR:$Rd, (stlex_2 rGPR:$Rt, addr_offset_none:$addr))]>, Requires<[IsThumb, HasAcquireRelease, - HasV7Clrex]>; + HasV7Clrex]>, Sched<[WriteST]>; def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, addr_offset_none:$addr), @@ -3416,7 +3417,8 @@ def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, "stlex", "\t$Rd, $Rt, $addr", "", [(set rGPR:$Rd, (stlex_4 rGPR:$Rt, addr_offset_none:$addr))]>, - Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]> { + Requires<[IsThumb, HasAcquireRelease, HasV7Clrex]>, + Sched<[WriteST]> { bits<4> Rd; bits<4> Rt; bits<4> addr; @@ -3433,7 +3435,7 @@ def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd), AddrModeNone, 4, NoItinerary, "stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [], {?, ?, ?, ?}>, Requires<[IsThumb, HasAcquireRelease, - HasV7Clrex, IsNotMClass]> { + HasV7Clrex, IsNotMClass]>, Sched<[WriteST]> { bits<4> Rt2; let Inst{11-8} = Rt2; } diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index 4e848406694b..ce74d325c4e5 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -424,4 +424,4 @@ include "ARMScheduleA9.td" include "ARMScheduleSwift.td" include "ARMScheduleR52.td" include "ARMScheduleA57.td" -include "ARMScheduleM3.td" +include "ARMScheduleM4.td" diff --git a/lib/Target/ARM/ARMScheduleM3.td b/lib/Target/ARM/ARMScheduleM3.td deleted file mode 100644 index 325e28977ee1..000000000000 --- a/lib/Target/ARM/ARMScheduleM3.td +++ /dev/null @@ -1,20 +0,0 @@ -//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for the ARM Cortex-M3 processor. -// -//===----------------------------------------------------------------------===// - -def CortexM3Model : SchedMachineModel { - let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue - let MicroOpBufferSize = 0; // In-order - let LoadLatency = 2; // Latency when not pipelined, not pc-relative - let MispredictPenalty = 2; // Best case branch taken cost - - let CompleteModel = 0; -} diff --git a/lib/Target/ARM/ARMScheduleM4.td b/lib/Target/ARM/ARMScheduleM4.td new file mode 100644 index 000000000000..38c8ea2b4f35 --- /dev/null +++ b/lib/Target/ARM/ARMScheduleM4.td @@ -0,0 +1,119 @@ +//==- ARMScheduleM4.td - Cortex-M4 Scheduling Definitions -*- tablegen -*-====// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the SchedRead/Write data for the ARM Cortex-M4 processor. +// +//===----------------------------------------------------------------------===// + +def CortexM4Model : SchedMachineModel { + let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue + let MicroOpBufferSize = 0; // In-order + let LoadLatency = 2; // Latency when not pipelined, not pc-relative + let MispredictPenalty = 2; // Best case branch taken cost + let PostRAScheduler = 1; + + let CompleteModel = 0; +} + + +// We model the entire cpu as a single pipeline with a BufferSize = 0 since +// Cortex-M4 is in-order. + +def M4Unit : ProcResource<1> { let BufferSize = 0; } + + +let SchedModel = CortexM4Model in { + +// Some definitions of latencies we apply to different instructions + +class M4UnitL1 : WriteRes { let Latency = 1; } +class M4UnitL2 : WriteRes { let Latency = 2; } +class M4UnitL3 : WriteRes { let Latency = 3; } +class M4UnitL14 : WriteRes { let Latency = 14; } +def M4UnitL1_wr : SchedWriteRes<[M4Unit]> { let Latency = 1; } +def M4UnitL2_wr : SchedWriteRes<[M4Unit]> { let Latency = 2; } +class M4UnitL1I : InstRW<[M4UnitL1_wr], instr>; +class M4UnitL2I : InstRW<[M4UnitL2_wr], instr>; + + +// Loads, MAC's and DIV all get a higher latency of 2 +def : M4UnitL2; +def : M4UnitL2; +def : M4UnitL2; +def : M4UnitL2; +def : M4UnitL2; +def : M4UnitL2; + +def : M4UnitL2I<(instregex "(t|t2)LDM")>; + + +// Stores we use a latency of 1 as they have no outputs + +def : M4UnitL1; +def : M4UnitL1I<(instregex "(t|t2)STM")>; + + +// Everything else has a Latency of 1 + +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1I<(instregex "(t|t2)MOV")>; +def : M4UnitL1I<(instrs COPY)>; +def : M4UnitL1I<(instregex "t2IT")>; +def : M4UnitL1I<(instregex "t2SEL", "t2USAD8", + "t2(S|Q|SH|U|UQ|UH)(ADD16|ASX|SAX|SUB16|ADD8|SUB8)", "t2USADA8", "(t|t2)REV")>; + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +// Most FP instructions are single-cycle latency, except MAC's, Div's and Sqrt's. +// Loads still take 2 cycles. + +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL2I<(instregex "VLD")>; +def : M4UnitL1I<(instregex "VST")>; +def : M4UnitL3; +def : M4UnitL3; +def : M4UnitL14; +def : M4UnitL14; +def : M4UnitL14; +def : M4UnitL14; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; +def : M4UnitL1; + +def : ReadAdvance; +def : ReadAdvance; + +} diff --git a/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll index 524424a25e02..40fd39e6eac5 100644 --- a/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll +++ b/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll @@ -5,10 +5,10 @@ ; CHECK-LABEL: add_user ; CHECK: %for.body -; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]! -; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]! -; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]] +; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]! +; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]! 
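+; (The looser [rl0-9] patterns are deliberate: with the new M4 schedule the
+; loads may be allocated to high registers such as lr.)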
; CHECK: sxtah [[COUNT:r[0-9]+]], [[COUNT]], [[A]] +; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]] define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { entry: %cmp24 = icmp sgt i32 %arg, 0 @@ -53,10 +53,10 @@ for.body: ; CHECK-LABEL: mul_bottom_user ; CHECK: %for.body -; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]! -; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]! -; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]] +; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]! +; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]! ; CHECK: sxth [[SXT:r[0-9]+]], [[A]] +; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]] ; CHECK: mul [[COUNT:r[0-9]+]],{{.*}}[[SXT]] define i32 @mul_bottom_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { entry: @@ -104,8 +104,8 @@ for.body: ; CHECK: %for.body ; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]! ; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]! -; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]] -; CHECK: asr.w [[ASR:[rl0-9]+]], [[B]], #16 +; CHECK: asrs [[ASR:[rl0-9]+]], [[A]], #16 +; CHECK: smlad [[ACC:[rl0-9]+]], [[A]], [[B]], [[ACC]] ; CHECK: mul [[COUNT:[rl0-9]+]],{{.}}[[ASR]] define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { entry: @@ -151,10 +151,10 @@ for.body: ; CHECK-LABEL: and_user ; CHECK: %for.body -; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]! -; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]! -; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]] +; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]! +; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]! ; CHECK: uxth [[UXT:r[0-9]+]], [[A]] +; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]] ; CHECK: mul [[MUL:r[0-9]+]],{{.*}}[[UXT]] define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { entry: @@ -201,12 +201,12 @@ for.body: ; CHECK-LABEL: multi_uses ; CHECK: %for.body -; CHECK: ldr [[A:r[0-9]+]], [{{.*}}, #2]! -; CHECK: ldr [[B:r[0-9]+]], [{{.*}}, #2]! -; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]] +; CHECK: ldr [[A:[rl0-9]+]], [{{.*}}, #2]! +; CHECK: ldr [[B:[rl0-9]+]], [{{.*}}, #2]! 
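+; (The smlad is now expected after the sxth: the schedule moves the extend
+; of the load result up, ahead of the accumulating multiply.)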
; CHECK: sxth [[SXT:r[0-9]+]], [[A]] +; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]] ; CHECK: eor.w [[EOR:r[0-9]+]], [[SXT]], [[SHIFT:r[0-9]+]] -; CHECK: mul [[MUL:r[0-9]+]],{{.*}}[[SXT]] +; CHECK: muls [[MUL:r[0-9]+]],{{.*}}[[SXT]] ; CHECK: lsl.w [[SHIFT]], [[MUL]], #16 define i32 @multi_uses(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { entry: diff --git a/test/CodeGen/ARM/aapcs-hfa-code.ll b/test/CodeGen/ARM/aapcs-hfa-code.ll index 5545dfdcd4c8..8d31485175af 100644 --- a/test/CodeGen/ARM/aapcs-hfa-code.ll +++ b/test/CodeGen/ARM/aapcs-hfa-code.ll @@ -76,8 +76,8 @@ define arm_aapcs_vfpcc void @test_1double_nosplit([4 x float], [4 x double], [3 ; CHECK-M4F-LABEL: test_1double_nosplit: ; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0 -; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0 ; CHECK-M4F: movt [[ONEHI]], #16368 +; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0 ; CHECK-M4F: strd [[ONELO]], [[ONEHI]], [sp] ; CHECK-M4F: bl test_1double_nosplit call arm_aapcs_vfpcc void @test_1double_nosplit([4 x float] undef, [4 x double] undef, [3 x float] undef, double 1.0) @@ -97,8 +97,8 @@ define arm_aapcs_vfpcc void @test_1double_misaligned([4 x double], [4 x double], ; CHECK-M4F-LABEL: test_1double_misaligned: ; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0 -; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0 ; CHECK-M4F: movt [[ONEHI]], #16368 +; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0 ; CHECK-M4F: strd [[ONELO]], [[ONEHI]], [sp, #8] ; CHECK-M4F: bl test_1double_misaligned diff --git a/test/CodeGen/ARM/useaa.ll b/test/CodeGen/ARM/useaa.ll index d7913e7bad90..076466d4d2c1 100644 --- a/test/CodeGen/ARM/useaa.ll +++ b/test/CodeGen/ARM/useaa.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA +; RUN: llc < %s -mtriple=armv7m-eabi -mcpu=cortex-m4 | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA +; RUN: llc < %s -mtriple=armv8m-eabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=CHECK --check-prefix=USEAA ; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC ; Check we use AA during codegen, so can interleave these loads/stores. diff --git a/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll index 0c5face6c039..b6b4805b97d9 100644 --- a/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll +++ b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll @@ -100,10 +100,10 @@ if.end: ; CHECK-BP: str ; CHECK-BP: b ; CHECK-BP: str -; CHECK-BP: ldr +; CHECK-BP: add ; CHECK-NOBP: ittee ; CHECK-NOBP: streq -; CHECK-NOBP: ldreq +; CHECK-NOBP: addeq ; CHECK-NOBP: strne ; CHECK-NOBP: strne define i32 @diamond2(i32 %n, i32* %p, i32* %q) { @@ -119,7 +119,7 @@ if.then: if.else: store i32 %n, i32* %q, align 4 - %0 = load i32, i32* %p, align 4 + %0 = add i32 %n, 10 br label %if.end if.end: diff --git a/test/CodeGen/Thumb2/m4-sched-ldr.mir b/test/CodeGen/Thumb2/m4-sched-ldr.mir new file mode 100644 index 000000000000..41abefd85a62 --- /dev/null +++ b/test/CodeGen/Thumb2/m4-sched-ldr.mir @@ -0,0 +1,60 @@ +# RUN: llc %s -run-pass machine-scheduler -o - | FileCheck %s + +# CHECK-LABEL: bb.0. 
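+# Both loads are expected to be scheduled ahead of the adds: with the
+# two-cycle load latency of the M4 model this hides the load-use stalls.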
+# CHECK: t2LDRi12 +# CHECK-NEXT: t2LDRi12 +# CHECK-NEXT: t2ADDri +# CHECK-NEXT: t2ADDri +--- | + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv7em-arm-none-eabi" + + ; Function Attrs: norecurse nounwind optsize readonly + define dso_local i32 @test(i32* nocapture readonly %a, i32* nocapture readonly %b) local_unnamed_addr #0 { + entry: + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, 10 + %1 = load i32, i32* %b, align 4 + %add1 = add nsw i32 %1, 20 + %mul = mul nsw i32 %add1, %add + ret i32 %mul + } + + attributes #0 = { "target-cpu"="cortex-m4" } + +... +--- +name: test +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +registers: + - { id: 0, class: gpr, preferred-register: '' } + - { id: 1, class: gpr, preferred-register: '' } + - { id: 2, class: gprnopc, preferred-register: '' } + - { id: 3, class: rgpr, preferred-register: '' } + - { id: 4, class: gprnopc, preferred-register: '' } + - { id: 5, class: rgpr, preferred-register: '' } + - { id: 6, class: rgpr, preferred-register: '' } +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$r1', virtual-reg: '%1' } +body: | + bb.0.entry: + liveins: $r0, $r1 + + %1:gpr = COPY $r1 + %0:gpr = COPY $r0 + %2:gprnopc = t2LDRi12 %0, 0, 14, $noreg :: (load 4 from %ir.a) + %3:rgpr = nsw t2ADDri %2, 10, 14, $noreg, $noreg + %4:gprnopc = t2LDRi12 %1, 0, 14, $noreg :: (load 4 from %ir.b) + %5:rgpr = nsw t2ADDri %4, 20, 14, $noreg, $noreg + %6:rgpr = nsw t2MUL %5, %3, 14, $noreg + $r0 = COPY %6 + tBX_RET 14, $noreg, implicit $r0 + +... diff --git a/test/CodeGen/Thumb2/m4-sched-regs.ll b/test/CodeGen/Thumb2/m4-sched-regs.ll new file mode 100644 index 000000000000..a83da8a55cee --- /dev/null +++ b/test/CodeGen/Thumb2/m4-sched-regs.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - | FileCheck %s + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7em-arm-none-eabi" + +%struct.a = type { i32, %struct.b*, i8, i8, i8, i8, i8*, %struct.b*, i16, i16, i16, i16, i16, i16, i16, i16, i32, i32, i32, i32, i32, i32, i32 } +%struct.b = type { i8, i8, i8, i8, i32, i16, i16, i32, i32, i32, i32, [16 x i8], [64 x i8], [128 x i8], i32, [68 x i8] } + +define void @test(%struct.a* nocapture %dhcp, i16 zeroext %value) #0 { +; CHECK-LABEL: test: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrh r3, [r0, #20] +; CHECK-NEXT: ldr.w lr, [r0, #16] +; CHECK-NEXT: lsr.w r12, r1, #8 +; CHECK-NEXT: adds r2, r3, #1 +; CHECK-NEXT: strh r2, [r0, #20] +; CHECK-NEXT: add.w r2, lr, r3 +; CHECK-NEXT: strb.w r12, [r2, #240] +; CHECK-NEXT: ldrh r2, [r0, #20] +; CHECK-NEXT: ldr.w r12, [r0, #16] +; CHECK-NEXT: adds r3, r2, #1 +; CHECK-NEXT: strh r3, [r0, #20] +; CHECK-NEXT: add.w r0, r12, r2 +; CHECK-NEXT: strb.w r1, [r0, #240] +; CHECK-NEXT: pop {r7, pc} +entry: + %shr = lshr i16 %value, 8 + %conv1 = trunc i16 %shr to i8 + %msg_out = getelementptr inbounds %struct.a, %struct.a* %dhcp, i32 0, i32 7 + %0 = load %struct.b*, %struct.b** %msg_out, align 4 + %options_out_len = getelementptr inbounds %struct.a, %struct.a* %dhcp, i32 0, i32 8 + %1 = load i16, i16* %options_out_len, align 4 + %inc = add i16 %1, 1 + store i16 %inc, i16* %options_out_len, align 4 + %idxprom = zext i16 %1 to i32 + %arrayidx = getelementptr inbounds %struct.b, %struct.b* %0, i32 0, i32 15, i32 
%idxprom + store i8 %conv1, i8* %arrayidx, align 1 + %conv4 = trunc i16 %value to i8 + %2 = load %struct.b*, %struct.b** %msg_out, align 4 + %3 = load i16, i16* %options_out_len, align 4 + %inc8 = add i16 %3, 1 + store i16 %inc8, i16* %options_out_len, align 4 + %idxprom9 = zext i16 %3 to i32 + %arrayidx10 = getelementptr inbounds %struct.b, %struct.b* %2, i32 0, i32 15, i32 %idxprom9 + store i8 %conv4, i8* %arrayidx10, align 1 + ret void +} + +attributes #0 = { minsize optsize "target-cpu"="cortex-m4" } From 769d167a6f534f9d50a57f677ca1b4eedc95b180 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 15 May 2019 12:58:02 +0000 Subject: [PATCH 09/19] [ARM] Don't use the Machine Scheduler for cortex-m at minsize The new cortex-m schedule in rL360768 helps performance, but can increase the amount of high-registers used. This, on average, ends up increasing the codesize by a fair amount (because less instructions are converted from T2 to T1). On cortex-m at -Oz, where we are quite size-paranoid, it is better to use the existing DAG scheduler with the RegPressure scheduling preference (at least until the issues around T2 vs T1 instructions can be improved). I have also made sure that the Sched::RegPressure dag scheduler is always chosen for MinSize. The test shows one case where we increase the number of registers used. Differential Revision: https://reviews.llvm.org/D61882 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360769 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMISelLowering.cpp | 2 +- lib/Target/ARM/ARMSubtarget.cpp | 7 +++++++ test/CodeGen/Thumb2/m4-sched-regs.ll | 22 ++++++++++------------ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 94f94d3fa320..643d2806c521 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1184,7 +1184,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setStackPointerRegisterToSaveRestore(ARM::SP); if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || - !Subtarget->hasVFP2()) + !Subtarget->hasVFP2() || Subtarget->hasMinSize()) setSchedulingPreference(Sched::RegPressure); else setSchedulingPreference(Sched::Hybrid); diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 22652d6256b9..63f694199f44 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -361,6 +361,13 @@ unsigned ARMSubtarget::getMispredictionPenalty() const { } bool ARMSubtarget::enableMachineScheduler() const { + // The MachineScheduler can increase register usage, so we use more high + // registers and end up with more T2 instructions that cannot be converted to + // T1 instructions. At least until we do better at converting to thumb1 + // instructions, on cortex-m at Oz where we are size-paranoid, don't use the + // Machine scheduler, relying on the DAG register pressure scheduler instead. + if (isMClass() && hasMinSize()) + return false; // Enable the MachineScheduler before register allocation for subtargets // with the use-misched feature. 
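+  // (Cores opt in via the UseMISched subtarget feature; the check above now
+  // overrides that choice for M-class cores compiled for minimum size.)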
  return useMachineScheduler();
diff --git a/test/CodeGen/Thumb2/m4-sched-regs.ll b/test/CodeGen/Thumb2/m4-sched-regs.ll
index a83da8a55cee..29952feff070 100644
--- a/test/CodeGen/Thumb2/m4-sched-regs.ll
+++ b/test/CodeGen/Thumb2/m4-sched-regs.ll
@@ -10,22 +10,20 @@ target triple = "thumbv7em-arm-none-eabi"
 define void @test(%struct.a* nocapture %dhcp, i16 zeroext %value) #0 {
 ; CHECK-LABEL: test:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: ldrh r3, [r0, #20]
-; CHECK-NEXT: ldr.w lr, [r0, #16]
-; CHECK-NEXT: lsr.w r12, r1, #8
-; CHECK-NEXT: adds r2, r3, #1
-; CHECK-NEXT: strh r2, [r0, #20]
-; CHECK-NEXT: add.w r2, lr, r3
-; CHECK-NEXT: strb.w r12, [r2, #240]
 ; CHECK-NEXT: ldrh r2, [r0, #20]
-; CHECK-NEXT: ldr.w r12, [r0, #16]
 ; CHECK-NEXT: adds r3, r2, #1
 ; CHECK-NEXT: strh r3, [r0, #20]
-; CHECK-NEXT: add.w r0, r12, r2
+; CHECK-NEXT: ldr r3, [r0, #16]
+; CHECK-NEXT: add r2, r3
+; CHECK-NEXT: lsrs r3, r1, #8
+; CHECK-NEXT: strb.w r3, [r2, #240]
+; CHECK-NEXT: ldrh r2, [r0, #20]
+; CHECK-NEXT: adds r3, r2, #1
+; CHECK-NEXT: strh r3, [r0, #20]
+; CHECK-NEXT: ldr r0, [r0, #16]
+; CHECK-NEXT: add r0, r2
 ; CHECK-NEXT: strb.w r1, [r0, #240]
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
 entry:
   %shr = lshr i16 %value, 8
   %conv1 = trunc i16 %shr to i8
From 399ff58753b792a44366a734a27406c74e829607 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 15 May 2019 13:03:10 +0000
Subject: [PATCH 10/19] Revert rL360675 : [APFloat] APFloat::Storage::Storage
 - fix use after move

This was mentioned both in https://www.viva64.com/en/b/0629/ and by
scan-build checks
........
There are concerns this may just introduce a use-after-free instead.....

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360770 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Support/APFloat.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index aea59301079e..208950d7ab71 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -4418,9 +4418,9 @@ APFloat::Storage::Storage(IEEEFloat F, const fltSemantics &Semantics) {
     return;
   }
   if (usesLayout<DoubleAPFloat>(Semantics)) {
-    const fltSemantics IEEESemantics = F.getSemantics();
-    new (&Double) DoubleAPFloat(Semantics, APFloat(std::move(F), IEEESemantics),
-                                APFloat(semIEEEdouble));
+    new (&Double)
+        DoubleAPFloat(Semantics, APFloat(std::move(F), F.getSemantics()),
+                      APFloat(semIEEEdouble));
     return;
   }
   llvm_unreachable("Unexpected semantics");
From de3c2efffebd761384ed41033bf2b60752843a6b Mon Sep 17 00:00:00 2001
From: Clement Courbet
Date: Wed, 15 May 2019 13:04:24 +0000
Subject: [PATCH 11/19] [MergeICmps] Simplify the code.

Instead of patching the original blocks, we now generate new blocks and
delete the old blocks. This results in simpler code with a less twisted
control flow (see the change in `entry-block-shuffled.ll`).

This will make https://reviews.llvm.org/D60318 simpler by making it more
obvious where control flow is created and deleted.
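As a sketch of the effect (the block and value names below are taken from
the updated pair-int32-int32.ll test), two contiguous i32 comparisons now
become a single fresh block, named after the blocks it replaces, that
branches straight to the phi:

  "entry+land.rhs.i":
    %memcmp = call i32 @memcmp(i8* %cstr, i8* %cstr1, i64 8)
    %0 = icmp eq i32 %memcmp, 0
    br label %opeq1.exit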
Reviewers: gchatelet Subscribers: hiraditya, llvm-commits, spatel Tags: #llvm Differential Revision: https://reviews.llvm.org/D61736 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360771 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/MergeICmps.cpp | 295 +++++++++--------- test/CodeGen/PowerPC/memcmp-mergeexpand.ll | 2 +- test/CodeGen/X86/memcmp-mergeexpand.ll | 4 +- .../MergeICmps/X86/alias-merge-blocks.ll | 15 +- .../MergeICmps/X86/entry-block-shuffled.ll | 52 +-- .../X86/multiple-blocks-does-work.ll | 18 +- .../MergeICmps/X86/pair-int32-int32.ll | 29 +- .../MergeICmps/X86/split-block-does-work.ll | 15 +- 8 files changed, 215 insertions(+), 215 deletions(-) diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp index 9a57ed6c6dc6..d8baf504f4f1 100644 --- a/lib/Transforms/Scalar/MergeICmps.cpp +++ b/lib/Transforms/Scalar/MergeICmps.cpp @@ -48,6 +48,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include #include @@ -406,13 +407,6 @@ class BCECmpChain { First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset; } - // Merges the given comparison blocks into one memcmp block and update - // branches. Comparisons are assumed to be continguous. If NextBBInChain is - // null, the merged block will link to the phi block. - void mergeComparisons(ArrayRef Comparisons, - BasicBlock *const NextBBInChain, PHINode &Phi, - const TargetLibraryInfo *const TLI, AliasAnalysis *AA); - PHINode &Phi_; std::vector Comparisons_; // The original entry block (before sorting); @@ -452,7 +446,7 @@ BCECmpChain::BCECmpChain(const std::vector &Blocks, PHINode &Phi, // chain before sorting. Unless we can abort the chain at this point // and start anew. // - // NOTE: we only handle block with single predecessor for now. + // NOTE: we only handle blocks a with single predecessor for now. if (Comparison.canSplit(AA)) { LLVM_DEBUG(dbgs() << "Split initial block '" << Comparison.BB->getName() @@ -540,162 +534,173 @@ void BCECmpChain::dump() const { } #endif // MERGEICMPS_DOT_ON -bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI, - AliasAnalysis *AA) { - // First pass to check if there is at least one merge. If not, we don't do - // anything and we keep analysis passes intact. - { - bool AtLeastOneMerged = false; - for (size_t I = 1; I < Comparisons_.size(); ++I) { - if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) { - AtLeastOneMerged = true; - break; - } - } - if (!AtLeastOneMerged) return false; - } +namespace { - // Remove phi references to comparison blocks, they will be rebuilt as we - // merge the blocks. - for (const auto &Comparison : Comparisons_) { - Phi_.removeIncomingValue(Comparison.BB, false); - } +// A class to compute the name of a set of merged basic blocks. +// This is optimized for the common case of no block names. +class MergedBlockName { + // Storage for the uncommon case of several named blocks. + SmallString<16> Scratch; - // If entry block is part of the chain, we need to make the first block - // of the chain the new entry block of the function. 
- BasicBlock *Entry = &Comparisons_[0].BB->getParent()->getEntryBlock(); - for (size_t I = 1; I < Comparisons_.size(); ++I) { - if (Entry == Comparisons_[I].BB) { - BasicBlock *NEntryBB = BasicBlock::Create(Entry->getContext(), "", - Entry->getParent(), Entry); - BranchInst::Create(Entry, NEntryBB); - break; - } - } - - // Point the predecessors of the chain to the first comparison block (which is - // the new entry point) and update the entry block of the chain. - if (EntryBlock_ != Comparisons_[0].BB) { - EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB); - EntryBlock_ = Comparisons_[0].BB; - } +public: + explicit MergedBlockName(ArrayRef Comparisons) + : Name(makeTwine(Comparisons)) {} + const Twine Name; - // Effectively merge blocks. - int NumMerged = 1; - for (size_t I = 1; I < Comparisons_.size(); ++I) { - if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) { - ++NumMerged; - } else { - // Merge all previous comparisons and start a new merge block. - mergeComparisons( - makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged), - Comparisons_[I].BB, Phi_, TLI, AA); - NumMerged = 1; +private: + Twine makeTwine(ArrayRef Comparisons) { + assert(!Comparisons.empty() && "no basic block"); + // Fast path: only one block, or no names at all. + if (Comparisons.size() == 1) + return Comparisons[0].BB->getName(); + const int size = std::accumulate(Comparisons.begin(), Comparisons.end(), 0, + [](int i, const BCECmpBlock &Cmp) { + return i + Cmp.BB->getName().size(); + }); + if (size == 0) + return Twine(); + + // Slow path: at least two blocks, at least one block with a name. + Scratch.clear(); + // We'll have `size` bytes for name and `Comparisons.size() - 1` bytes for + // separators. + Scratch.reserve(size + Comparisons.size() - 1); + const auto append = [this](StringRef str) { + Scratch.append(str.begin(), str.end()); + }; + append(Comparisons[0].BB->getName()); + for (int I = 1, E = Comparisons.size(); I < E; ++I) { + const BasicBlock *const BB = Comparisons[I].BB; + if (!BB->getName().empty()) { + append("+"); + append(BB->getName()); + } } + return Twine(Scratch); } - mergeComparisons(makeArrayRef(Comparisons_) - .slice(Comparisons_.size() - NumMerged, NumMerged), - nullptr, Phi_, TLI, AA); - - return true; -} +}; +} // namespace + +// Merges the given contiguous comparison blocks into one memcmp block. +static BasicBlock *mergeComparisons(ArrayRef Comparisons, + BasicBlock *const NextCmpBlock, + PHINode &Phi, + const TargetLibraryInfo *const TLI, + AliasAnalysis *AA) { + assert(!Comparisons.empty() && "merging zero comparisons"); + LLVMContext &Context = NextCmpBlock->getContext(); + const BCECmpBlock &FirstCmp = Comparisons[0]; + + // Create a new cmp block before next cmp block. + BasicBlock *const BB = + BasicBlock::Create(Context, MergedBlockName(Comparisons).Name, + NextCmpBlock->getParent(), NextCmpBlock); + IRBuilder<> Builder(BB); + // Add the GEPs from the first BCECmpBlock. + Value *const Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone()); + Value *const Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone()); + + Value *IsEqual = nullptr; + if (Comparisons.size() == 1) { + LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n"); + Value *const LhsLoad = + Builder.CreateLoad(FirstCmp.Lhs().LoadI->getType(), Lhs); + Value *const RhsLoad = + Builder.CreateLoad(FirstCmp.Rhs().LoadI->getType(), Rhs); + // There are no blocks to merge, just do the comparison. 
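+    // (A lone comparison keeps its scalar loads; only runs of two or more
+    // contiguous comparisons become a memcmp call in the else branch.)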
+ IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad); + } else { + LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n"); -void BCECmpChain::mergeComparisons(ArrayRef Comparisons, - BasicBlock *const NextBBInChain, - PHINode &Phi, - const TargetLibraryInfo *const TLI, - AliasAnalysis *AA) { - assert(!Comparisons.empty()); - const auto &FirstComparison = *Comparisons.begin(); - BasicBlock *const BB = FirstComparison.BB; - LLVMContext &Context = BB->getContext(); - - if (Comparisons.size() >= 2) { // If there is one block that requires splitting, we do it now, i.e. // just before we know we will collapse the chain. The instructions // can be executed before any of the instructions in the chain. - auto C = std::find_if(Comparisons.begin(), Comparisons.end(), - [](const BCECmpBlock &B) { return B.RequireSplit; }); - if (C != Comparisons.end()) - C->split(EntryBlock_, AA); + const auto ToSplit = + std::find_if(Comparisons.begin(), Comparisons.end(), + [](const BCECmpBlock &B) { return B.RequireSplit; }); + if (ToSplit != Comparisons.end()) { + LLVM_DEBUG(dbgs() << "Splitting non_BCE work to header\n"); + ToSplit->split(BB, AA); + } - LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n"); - const auto TotalSize = - std::accumulate(Comparisons.begin(), Comparisons.end(), 0, - [](int Size, const BCECmpBlock &C) { - return Size + C.SizeBits(); - }) / - 8; - - // Incoming edges do not need to be updated, and both GEPs are already - // computing the right address, we just need to: - // - replace the two loads and the icmp with the memcmp - // - update the branch - // - update the incoming values in the phi. - FirstComparison.BranchI->eraseFromParent(); - FirstComparison.CmpI->eraseFromParent(); - FirstComparison.Lhs().LoadI->eraseFromParent(); - FirstComparison.Rhs().LoadI->eraseFromParent(); - - IRBuilder<> Builder(BB); + const unsigned TotalSizeBits = std::accumulate( + Comparisons.begin(), Comparisons.end(), 0u, + [](int Size, const BCECmpBlock &C) { return Size + C.SizeBits(); }); + + // Create memcmp() == 0. const auto &DL = Phi.getModule()->getDataLayout(); Value *const MemCmpCall = emitMemCmp( - FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, - ConstantInt::get(DL.getIntPtrType(Context), TotalSize), - Builder, DL, TLI); - Value *const MemCmpIsZero = Builder.CreateICmpEQ( + Lhs, Rhs, + ConstantInt::get(DL.getIntPtrType(Context), TotalSizeBits / 8), Builder, + DL, TLI); + IsEqual = Builder.CreateICmpEQ( MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0)); + } - // Add a branch to the next basic block in the chain. - if (NextBBInChain) { - Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent()); - Phi.addIncoming(ConstantInt::getFalse(Context), BB); - } else { - Builder.CreateBr(Phi.getParent()); - Phi.addIncoming(MemCmpIsZero, BB); - } + BasicBlock *const PhiBB = Phi.getParent(); + // Add a branch to the next basic block in the chain. + if (NextCmpBlock == PhiBB) { + // Continue to phi, passing it the comparison result. + Builder.CreateBr(Phi.getParent()); + Phi.addIncoming(IsEqual, BB); + } else { + // Continue to next block if equal, exit to phi else. + Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB); + Phi.addIncoming(ConstantInt::getFalse(Context), BB); + } + return BB; +} - // Delete merged blocks. 
- for (size_t I = 1; I < Comparisons.size(); ++I) { - BasicBlock *CBB = Comparisons[I].BB; - CBB->replaceAllUsesWith(BB); - CBB->eraseFromParent(); +bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI, + AliasAnalysis *AA) { + assert(Comparisons_.size() >= 2 && "simplifying trivial BCECmpChain"); + // First pass to check if there is at least one merge. If not, we don't do + // anything and we keep analysis passes intact. + const auto AtLeastOneMerged = [this]() { + for (size_t I = 1; I < Comparisons_.size(); ++I) { + if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) + return true; } - } else { - assert(Comparisons.size() == 1); - // There are no blocks to merge, but we still need to update the branches. - LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n"); - if (NextBBInChain) { - if (FirstComparison.BranchI->isConditional()) { - LLVM_DEBUG(dbgs() << "conditional -> conditional\n"); - // Just update the "true" target, the "false" target should already be - // the phi block. - assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent()); - FirstComparison.BranchI->setSuccessor(0, NextBBInChain); - Phi.addIncoming(ConstantInt::getFalse(Context), BB); - } else { - LLVM_DEBUG(dbgs() << "unconditional -> conditional\n"); - // Replace the unconditional branch by a conditional one. - FirstComparison.BranchI->eraseFromParent(); - IRBuilder<> Builder(BB); - Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain, - Phi.getParent()); - Phi.addIncoming(FirstComparison.CmpI, BB); - } + return false; + }; + if (!AtLeastOneMerged()) + return false; + + // Effectively merge blocks. We go in the reverse direction from the phi block + // so that the next block is always available to branch to. + const auto mergeRange = [this, TLI, AA](int I, int Num, BasicBlock *Next) { + return mergeComparisons(makeArrayRef(Comparisons_).slice(I, Num), Next, + Phi_, TLI, AA); + }; + int NumMerged = 1; + BasicBlock *NextCmpBlock = Phi_.getParent(); + for (int I = static_cast(Comparisons_.size()) - 2; I >= 0; --I) { + if (IsContiguous(Comparisons_[I], Comparisons_[I + 1])) { + ++NumMerged; } else { - if (FirstComparison.BranchI->isConditional()) { - LLVM_DEBUG(dbgs() << "conditional -> unconditional\n"); - // Replace the conditional branch by an unconditional one. - FirstComparison.BranchI->eraseFromParent(); - IRBuilder<> Builder(BB); - Builder.CreateBr(Phi.getParent()); - Phi.addIncoming(FirstComparison.CmpI, BB); - } else { - LLVM_DEBUG(dbgs() << "unconditional -> unconditional\n"); - Phi.addIncoming(FirstComparison.CmpI, BB); - } + NextCmpBlock = mergeRange(I + 1, NumMerged, NextCmpBlock); + NumMerged = 1; } } + NextCmpBlock = mergeRange(0, NumMerged, NextCmpBlock); + + // Replace the original cmp chain with the new cmp chain by pointing all + // predecessors of EntryBlock_ to NextCmpBlock instead. This makes all cmp + // blocks in the old chain unreachable. + for (BasicBlock *Pred : predecessors(EntryBlock_)) { + Pred->getTerminator()->replaceUsesOfWith(EntryBlock_, NextCmpBlock); + } + EntryBlock_ = nullptr; + + // Delete merged blocks. This also removes incoming values in phi. 
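+  // (All of these blocks are unreachable by now: every predecessor of the
+  // old chain was redirected to the head of the new chain above.)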
+ SmallVector DeadBlocks; + for (auto &Cmp : Comparisons_) { + DeadBlocks.push_back(Cmp.BB); + } + DeleteDeadBlocks(DeadBlocks); + + Comparisons_.clear(); + return true; } std::vector getOrderedBlocks(PHINode &Phi, diff --git a/test/CodeGen/PowerPC/memcmp-mergeexpand.ll b/test/CodeGen/PowerPC/memcmp-mergeexpand.ll index c1e81074c863..298ce90b74ee 100644 --- a/test/CodeGen/PowerPC/memcmp-mergeexpand.ll +++ b/test/CodeGen/PowerPC/memcmp-mergeexpand.ll @@ -7,7 +7,7 @@ define zeroext i1 @opeq1( ; PPC64LE-LABEL: opeq1: -; PPC64LE: # %bb.0: # %entry +; PPC64LE: # %bb.0: # %"entry+land.rhs.i" ; PPC64LE-NEXT: ld 3, 0(3) ; PPC64LE-NEXT: ld 4, 0(4) ; PPC64LE-NEXT: xor 3, 3, 4 diff --git a/test/CodeGen/X86/memcmp-mergeexpand.ll b/test/CodeGen/X86/memcmp-mergeexpand.ll index 785ba403465e..0be463daaeb0 100644 --- a/test/CodeGen/X86/memcmp-mergeexpand.ll +++ b/test/CodeGen/X86/memcmp-mergeexpand.ll @@ -8,7 +8,7 @@ define zeroext i1 @opeq1( ; X86-LABEL: opeq1: -; X86: # %bb.0: # %entry +; X86: # %bb.0: # %"entry+land.rhs.i" ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx @@ -20,7 +20,7 @@ define zeroext i1 @opeq1( ; X86-NEXT: retl ; ; X64-LABEL: opeq1: -; X64: # %bb.0: # %entry +; X64: # %bb.0: # %"entry+land.rhs.i" ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax ; X64-NEXT: sete %al diff --git a/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll b/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll index fa4af66e6392..00c70fba9c97 100644 --- a/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll +++ b/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll @@ -5,19 +5,18 @@ define zeroext i1 @opeq1( ; X86-LABEL: @opeq1( -; X86-NEXT: entry: +; X86-NEXT: "entry+land.rhs.i+land.rhs.i.2+land.rhs.i.3": ; X86-NEXT: [[PTR:%.*]] = alloca i32 ; X86-NEXT: store i32 42, i32* [[PTR]] -; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8* +; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* ; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16) -; X86-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] ; X86: opeq1.exit: -; X86-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[ENTRY:%.*]] ] -; X86-NEXT: ret i1 [[TMP1]] +; X86-NEXT: ret i1 [[TMP2]] ; %S* nocapture readonly dereferenceable(16) %a, %S* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { diff --git a/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll b/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll index f416fa451eba..2123b7969c30 100644 --- a/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll +++ b/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll @@ -3,37 +3,37 @@ %S = type { i32, i32, i32, i32 } -; The entry block is part of the chain. It however can not be merged. We need to make the -; first comparison block in the chain the new entry block of the function. +; The entry block is part of the chain. It however can not be merged. 
We need to +; make sure that the control flow is still consistent (goes through each of the +; blocks). define zeroext i1 @opeq1( ; CHECK-LABEL: @opeq1( -; CHECK-NEXT: br label [[LAND_RHS_I:%.*]] -; CHECK: entry: -; CHECK-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[FIRST_I]], align 4 -; CHECK-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[FIRST1_I]], align 4 -; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]] -; CHECK-NEXT: br i1 [[CMP_I]], label [[LAND_RHS_I_3:%.*]], label [[OPEQ1_EXIT:%.*]] -; CHECK: land.rhs.i: -; CHECK-NEXT: [[SECOND_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 0 -; CHECK-NEXT: [[SECOND2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 0 -; CHECK-NEXT: [[CSTR:%.*]] = bitcast i32* [[SECOND_I]] to i8* -; CHECK-NEXT: [[CSTR1:%.*]] = bitcast i32* [[SECOND2_I]] to i8* -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; CHECK-NEXT: br i1 [[TMP3]], label [[ENTRY:%.*]], label [[OPEQ1_EXIT]] -; CHECK: land.rhs.i.3: -; CHECK-NEXT: [[FOURTH_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[FOURTH_I]], align 4 -; CHECK-NEXT: [[FOURTH2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[FOURTH2_I]], align 4 -; CHECK-NEXT: [[CMP5_I:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; CHECK-NEXT: "land.rhs.i+land.rhs.i.2": +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* +; CHECK-NEXT: [[CSTR3:%.*]] = bitcast i32* [[TMP1]] to i8* +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR3]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[ENTRY2:%.*]], label [[OPEQ1_EXIT:%.*]] +; CHECK: entry2: +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]] +; CHECK: land.rhs.i.31: +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP10]], [[TMP11]] ; CHECK-NEXT: br label [[OPEQ1_EXIT]] ; CHECK: opeq1.exit: -; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ false, [[LAND_RHS_I]] ], [ false, [[ENTRY]] ], [ [[CMP5_I]], [[LAND_RHS_I_3]] ] -; CHECK-NEXT: ret i1 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = phi i1 [ [[TMP12]], [[LAND_RHS_I_31]] ], [ false, [[ENTRY2]] ], [ false, %"land.rhs.i+land.rhs.i.2" ] +; CHECK-NEXT: ret i1 [[TMP13]] ; %S* nocapture readonly dereferenceable(16) %a, %S* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { diff --git a/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll 
b/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll index 790c0e9854d4..0a75d3bdd01a 100644 --- a/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll +++ b/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll @@ -23,18 +23,18 @@ define zeroext i1 @opeq1( ; X86-NEXT: [[TMP3:%.*]] = load i32, i32* [[SECOND2_I]], align 4 ; X86-NEXT: call void (...) @foo() ; X86-NEXT: [[CMP2_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] -; X86-NEXT: br i1 [[CMP2_I]], label [[LAND_RHS_I_2:%.*]], label [[OPEQ1_EXIT]] -; X86: land.rhs.i.2: -; X86-NEXT: [[THIRD_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 2 -; X86-NEXT: [[THIRD2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[THIRD_I]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[THIRD2_I]] to i8* +; X86-NEXT: br i1 [[CMP2_I]], label %"land.rhs.i.2+land.rhs.i.3", label [[OPEQ1_EXIT]] +; X86: "land.rhs.i.2+land.rhs.i.3": +; X86-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 2 +; X86-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP4]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP5]] to i8* ; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; X86-NEXT: [[TMP4:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: [[TMP6:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; X86-NEXT: br label [[OPEQ1_EXIT]] ; X86: opeq1.exit: -; X86-NEXT: [[TMP5:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ false, [[LAND_RHS_I]] ], [ [[TMP4]], [[LAND_RHS_I_2]] ] -; X86-NEXT: ret i1 [[TMP5]] +; X86-NEXT: [[TMP7:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ false, [[LAND_RHS_I]] ], [ [[TMP6]], %"land.rhs.i.2+land.rhs.i.3" ] +; X86-NEXT: ret i1 [[TMP7]] ; %S* nocapture readonly dereferenceable(16) %a, %S* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { diff --git a/test/Transforms/MergeICmps/X86/pair-int32-int32.ll b/test/Transforms/MergeICmps/X86/pair-int32-int32.ll index 13f2f4874d8d..0a6a681e9d97 100644 --- a/test/Transforms/MergeICmps/X86/pair-int32-int32.ll +++ b/test/Transforms/MergeICmps/X86/pair-int32-int32.ll @@ -6,17 +6,16 @@ define zeroext i1 @opeq1( ; X86-LABEL: @opeq1( -; X86-NEXT: entry: -; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8* +; X86-NEXT: "entry+land.rhs.i": +; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* ; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; X86-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] ; X86: opeq1.exit: -; X86-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[ENTRY:%.*]] ] -; X86-NEXT: ret i1 [[TMP1]] +; X86-NEXT: ret i1 [[TMP2]] ; ; X86-NOBUILTIN-LABEL: @opeq1( ; X86-NOBUILTIN-NEXT: entry: @@ -67,17 +66,15 @@ opeq1.exit: ; Same as above, but the two blocks are in inverse order. 
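+; The merged block's name joins the names of the blocks it replaces; the
+; chain is sorted by offset, so here it comes out as "land.rhs.i+entry".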
define zeroext i1 @opeq1_inverse( ; X86-LABEL: @opeq1_inverse( -; X86-NEXT: br label [[LAND_RHS_I:%.*]] -; X86: land.rhs.i: -; X86-NEXT: [[SECOND_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[SECOND2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[SECOND_I]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[SECOND2_I]] to i8* +; X86-NEXT: "land.rhs.i+entry": +; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* ; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; X86-NEXT: [[TMP1:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] ; X86: opeq1.exit: -; X86-NEXT: [[TMP2:%.*]] = phi i1 [ [[TMP1]], [[LAND_RHS_I]] ] ; X86-NEXT: ret i1 [[TMP2]] ; ; X86-NOBUILTIN-LABEL: @opeq1_inverse( diff --git a/test/Transforms/MergeICmps/X86/split-block-does-work.ll b/test/Transforms/MergeICmps/X86/split-block-does-work.ll index 91ef9b1622cd..63283edd0ca2 100644 --- a/test/Transforms/MergeICmps/X86/split-block-does-work.ll +++ b/test/Transforms/MergeICmps/X86/split-block-does-work.ll @@ -8,18 +8,17 @@ declare void @foo(...) nounwind readnone ; We can split %entry and create a memcmp(16 bytes). define zeroext i1 @opeq1( ; X86-LABEL: @opeq1( -; X86-NEXT: entry: +; X86-NEXT: "entry+land.rhs.i+land.rhs.i.2+land.rhs.i.3": ; X86-NEXT: call void (...) @foo() -; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8* +; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* ; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16) -; X86-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] ; X86: opeq1.exit: -; X86-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[ENTRY:%.*]] ] -; X86-NEXT: ret i1 [[TMP1]] +; X86-NEXT: ret i1 [[TMP2]] ; ; Make sure this call is moved to the beginning of the entry block. %S* nocapture readonly dereferenceable(16) %a, From 76562e6377b52be26aa3812b649e0f14734cf226 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Wed, 15 May 2019 13:15:48 +0000 Subject: [PATCH 12/19] [Salvage] Change salvage debug info implementation to use DW_OP_LLVM_convert where needed Fixes issue: https://bugs.llvm.org/show_bug.cgi?id=40645 Previously, LLVM had no functional way of performing casts inside of a DIExpression(), which made salvaging cast instructions other than Noop casts impossible. With the recent addition of DW_OP_LLVM_convert this salvaging is now possible, and so can be used to fix the attached bug as well as any cases where SExt instruction results are lost in the debugging metadata. 
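For example (an illustrative sketch, mirroring what the new code emits):
salvaging the debug use of a sign extension

  %ext = sext i32 %x to i64

now appends a conversion pair to the variable's DIExpression:

  DW_OP_LLVM_convert, 32, DW_ATE_signed, DW_OP_LLVM_convert, 64, DW_ATE_signed

Truncations instead use DW_ATE_unsigned, since signedness does not matter
on the expression stack in that direction.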
This patch introduces this fix by expanding the salvage debug info method to cover these cases using the new operator. Differential revision: https://reviews.llvm.org/D61184 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360772 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/Local.cpp | 23 +++++++++++++++-- test/DebugInfo/salvage-cast-debug-info.ll | 25 +++++++++++++++++++ .../Transforms/InstCombine/cast-mul-select.ll | 2 ++ 3 files changed, 48 insertions(+), 2 deletions(-) create mode 100755 test/DebugInfo/salvage-cast-debug-info.ll diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 2a4e9054273b..04e6cbb20c33 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1690,8 +1690,27 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, // No-op casts and zexts are irrelevant for debug info. if (CI->isNoopCast(DL) || isa(&I)) return SrcDIExpr; - return nullptr; - } else if (auto *GEP = dyn_cast(&I)) { + + Type *Type = CI->getType(); + // Casts other than Trunc or SExt to scalar types cannot be salvaged. + if (Type->isVectorTy() || (!isa(&I) && !isa(&I))) + return nullptr; + + Value *FromValue = CI->getOperand(0); + unsigned FromTypeBitSize = FromValue->getType()->getScalarSizeInBits(); + + unsigned ToTypeBitSize = Type->getScalarSizeInBits(); + + // The result of the cast will be sign extended iff the instruction is a + // SExt; signedness is otherwise irrelevant on the expression stack. + unsigned Encoding = + isa(&I) ? dwarf::DW_ATE_signed : dwarf::DW_ATE_unsigned; + + return applyOps({dwarf::DW_OP_LLVM_convert, FromTypeBitSize, Encoding, + dwarf::DW_OP_LLVM_convert, ToTypeBitSize, Encoding}); + } + + if (auto *GEP = dyn_cast(&I)) { unsigned BitWidth = M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace()); // Rewrite a constant GEP into a DIExpression. diff --git a/test/DebugInfo/salvage-cast-debug-info.ll b/test/DebugInfo/salvage-cast-debug-info.ll new file mode 100755 index 000000000000..1c8196a6589e --- /dev/null +++ b/test/DebugInfo/salvage-cast-debug-info.ll @@ -0,0 +1,25 @@ +; RUN: opt %s -debugify -early-cse -S | FileCheck %s +define i32 @foo(i64 %nose, i32 %more) { +; CHECK-LABEL: @foo( +; CHECK: call void @llvm.dbg.value(metadata i64 %nose, metadata [[V1:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_convert, 64, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned +; CHECK: call void @llvm.dbg.value(metadata i64 %nose.shift, metadata [[V2:![0-9]+]] +; CHECK: call void @llvm.dbg.value(metadata i64 %nose.shift, metadata [[V3:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_convert, 64, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned + +entry: + %nose.trunc = trunc i64 %nose to i32 + %nose.shift = lshr i64 %nose, 32 + %nose.trunc.2 = trunc i64 %nose.shift to i32 + %add = add nsw i32 %more, 1 + ret i32 %add +} + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 2} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 9.0.0 "} + +; CHECK: [[V1]] = !DILocalVariable( +; CHECK: [[V2]] = !DILocalVariable( +; CHECK: [[V3]] = !DILocalVariable( diff --git a/test/Transforms/InstCombine/cast-mul-select.ll b/test/Transforms/InstCombine/cast-mul-select.ll index c501fd8d04cd..f82d2fd285fe 100644 --- a/test/Transforms/InstCombine/cast-mul-select.ll +++ b/test/Transforms/InstCombine/cast-mul-select.ll @@ -13,6 +13,8 @@ define i32 @mul(i32 %x, i32 %y) { ; we preserve the debug information in the resulting ; instruction. 
; DBGINFO-LABEL: @mul( +; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 %x +; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 %y ; DBGINFO-NEXT: [[C:%.*]] = mul i32 {{.*}} ; DBGINFO-NEXT: [[D:%.*]] = and i32 {{.*}} ; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 [[C]] From dabe45c6d5c291c80e82ecb5466910357d04f7ce Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Wed, 15 May 2019 13:41:44 +0000 Subject: [PATCH 13/19] Revert "[Salvage] Change salvage debug info implementation to use DW_OP_LLVM_convert where needed" This reverts r360772 due to build issues. Reverted commit: 17dd4d7403770bd683675e45f5517e0cdb8f9b2b. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360773 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/Local.cpp | 23 ++--------------- test/DebugInfo/salvage-cast-debug-info.ll | 25 ------------------- .../Transforms/InstCombine/cast-mul-select.ll | 2 -- 3 files changed, 2 insertions(+), 48 deletions(-) delete mode 100755 test/DebugInfo/salvage-cast-debug-info.ll diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 04e6cbb20c33..2a4e9054273b 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1690,27 +1690,8 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, // No-op casts and zexts are irrelevant for debug info. if (CI->isNoopCast(DL) || isa(&I)) return SrcDIExpr; - - Type *Type = CI->getType(); - // Casts other than Trunc or SExt to scalar types cannot be salvaged. - if (Type->isVectorTy() || (!isa(&I) && !isa(&I))) - return nullptr; - - Value *FromValue = CI->getOperand(0); - unsigned FromTypeBitSize = FromValue->getType()->getScalarSizeInBits(); - - unsigned ToTypeBitSize = Type->getScalarSizeInBits(); - - // The result of the cast will be sign extended iff the instruction is a - // SExt; signedness is otherwise irrelevant on the expression stack. - unsigned Encoding = - isa(&I) ? dwarf::DW_ATE_signed : dwarf::DW_ATE_unsigned; - - return applyOps({dwarf::DW_OP_LLVM_convert, FromTypeBitSize, Encoding, - dwarf::DW_OP_LLVM_convert, ToTypeBitSize, Encoding}); - } - - if (auto *GEP = dyn_cast(&I)) { + return nullptr; + } else if (auto *GEP = dyn_cast(&I)) { unsigned BitWidth = M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace()); // Rewrite a constant GEP into a DIExpression. 
diff --git a/test/DebugInfo/salvage-cast-debug-info.ll b/test/DebugInfo/salvage-cast-debug-info.ll deleted file mode 100755 index 1c8196a6589e..000000000000 --- a/test/DebugInfo/salvage-cast-debug-info.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: opt %s -debugify -early-cse -S | FileCheck %s -define i32 @foo(i64 %nose, i32 %more) { -; CHECK-LABEL: @foo( -; CHECK: call void @llvm.dbg.value(metadata i64 %nose, metadata [[V1:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_convert, 64, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned -; CHECK: call void @llvm.dbg.value(metadata i64 %nose.shift, metadata [[V2:![0-9]+]] -; CHECK: call void @llvm.dbg.value(metadata i64 %nose.shift, metadata [[V3:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_convert, 64, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned - -entry: - %nose.trunc = trunc i64 %nose to i32 - %nose.shift = lshr i64 %nose, 32 - %nose.trunc.2 = trunc i64 %nose.shift to i32 - %add = add nsw i32 %more, 1 - ret i32 %add -} - -!llvm.module.flags = !{!0, !1} -!llvm.ident = !{!2} - -!0 = !{i32 1, !"wchar_size", i32 2} -!1 = !{i32 7, !"PIC Level", i32 2} -!2 = !{!"clang version 9.0.0 "} - -; CHECK: [[V1]] = !DILocalVariable( -; CHECK: [[V2]] = !DILocalVariable( -; CHECK: [[V3]] = !DILocalVariable( diff --git a/test/Transforms/InstCombine/cast-mul-select.ll b/test/Transforms/InstCombine/cast-mul-select.ll index f82d2fd285fe..c501fd8d04cd 100644 --- a/test/Transforms/InstCombine/cast-mul-select.ll +++ b/test/Transforms/InstCombine/cast-mul-select.ll @@ -13,8 +13,6 @@ define i32 @mul(i32 %x, i32 %y) { ; we preserve the debug information in the resulting ; instruction. ; DBGINFO-LABEL: @mul( -; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 %x -; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 %y ; DBGINFO-NEXT: [[C:%.*]] = mul i32 {{.*}} ; DBGINFO-NEXT: [[D:%.*]] = and i32 {{.*}} ; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 [[C]] From 2b69712b6e7e9160f12991730bcc864deb3594a0 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Wed, 15 May 2019 14:00:45 +0000 Subject: [PATCH 14/19] [MergeICmps] Fix r360771. Twine references a StringRef by reference, not value... git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360775 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/MergeICmps.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp index d8baf504f4f1..82d186250df2 100644 --- a/lib/Transforms/Scalar/MergeICmps.cpp +++ b/lib/Transforms/Scalar/MergeICmps.cpp @@ -544,11 +544,11 @@ class MergedBlockName { public: explicit MergedBlockName(ArrayRef Comparisons) - : Name(makeTwine(Comparisons)) {} - const Twine Name; + : Name(makeName(Comparisons)) {} + const StringRef Name; private: - Twine makeTwine(ArrayRef Comparisons) { + StringRef makeName(ArrayRef Comparisons) { assert(!Comparisons.empty() && "no basic block"); // Fast path: only one block, or no names at all. if (Comparisons.size() == 1) @@ -558,7 +558,7 @@ class MergedBlockName { return i + Cmp.BB->getName().size(); }); if (size == 0) - return Twine(); + return StringRef("", 0); // Slow path: at least two blocks, at least one block with a name. 
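+  // (Building the name in the member Scratch keeps the returned StringRef
+  // alive; a Twine would capture a reference to a temporary StringRef and
+  // dangle, which is the bug this patch fixes.)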
    Scratch.clear();
@@ -576,7 +576,7 @@ class MergedBlockName {
         append(BB->getName());
       }
     }
-    return Twine(Scratch);
+    return StringRef(Scratch);
   }
 };
 } // namespace
From 1e3d019b42b75147cd239ba2f434ec3c28822436 Mon Sep 17 00:00:00 2001
From: Clement Courbet
Date: Wed, 15 May 2019 14:21:59 +0000
Subject: [PATCH 15/19] Revert r360771 "[MergeICmps] Simplify the code."

Breaks a bunch of buildbots.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360776 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/MergeICmps.cpp          | 295 +++++++++---------
 test/CodeGen/PowerPC/memcmp-mergeexpand.ll    |   2 +-
 test/CodeGen/X86/memcmp-mergeexpand.ll        |   4 +-
 .../MergeICmps/X86/alias-merge-blocks.ll      |  15 +-
 .../MergeICmps/X86/entry-block-shuffled.ll    |  52 +--
 .../X86/multiple-blocks-does-work.ll          |  18 +-
 .../MergeICmps/X86/pair-int32-int32.ll        |  29 +-
 .../MergeICmps/X86/split-block-does-work.ll   |  15 +-
 8 files changed, 215 insertions(+), 215 deletions(-)

diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp
index 82d186250df2..9a57ed6c6dc6 100644
--- a/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/lib/Transforms/Scalar/MergeICmps.cpp
@@ -48,7 +48,6 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include <algorithm>
 #include <numeric>
@@ -407,6 +406,13 @@ class BCECmpChain {
            First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
   }

+  // Merges the given comparison blocks into one memcmp block and update
+  // branches. Comparisons are assumed to be continguous. If NextBBInChain is
+  // null, the merged block will link to the phi block.
+  void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+                        BasicBlock *const NextBBInChain, PHINode &Phi,
+                        const TargetLibraryInfo *const TLI, AliasAnalysis *AA);
+
   PHINode &Phi_;
   std::vector<BCECmpBlock> Comparisons_;
   // The original entry block (before sorting);
@@ -446,7 +452,7 @@ BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
       // chain before sorting. Unless we can abort the chain at this point
       // and start anew.
       //
-      // NOTE: we only handle blocks a with single predecessor for now.
+      // NOTE: we only handle block with single predecessor for now.
       if (Comparison.canSplit(AA)) {
         LLVM_DEBUG(dbgs()
                    << "Split initial block '" << Comparison.BB->getName()
@@ -534,173 +540,162 @@ void BCECmpChain::dump() const {
 }
 #endif  // MERGEICMPS_DOT_ON

-namespace {
+bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI,
+                           AliasAnalysis *AA) {
+  // First pass to check if there is at least one merge. If not, we don't do
+  // anything and we keep analysis passes intact.
+  {
+    bool AtLeastOneMerged = false;
+    for (size_t I = 1; I < Comparisons_.size(); ++I) {
+      if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
+        AtLeastOneMerged = true;
+        break;
+      }
+    }
+    if (!AtLeastOneMerged) return false;
+  }

-// A class to compute the name of a set of merged basic blocks.
-// This is optimized for the common case of no block names.
-class MergedBlockName {
-  // Storage for the uncommon case of several named blocks.
-  SmallString<16> Scratch;
+  // Remove phi references to comparison blocks, they will be rebuilt as we
+  // merge the blocks.
+ for (const auto &Comparison : Comparisons_) { + Phi_.removeIncomingValue(Comparison.BB, false); + } -public: - explicit MergedBlockName(ArrayRef<BCECmpBlock> Comparisons) - : Name(makeName(Comparisons)) {} - const StringRef Name; + // If entry block is part of the chain, we need to make the first block + // of the chain the new entry block of the function. + BasicBlock *Entry = &Comparisons_[0].BB->getParent()->getEntryBlock(); + for (size_t I = 1; I < Comparisons_.size(); ++I) { + if (Entry == Comparisons_[I].BB) { + BasicBlock *NEntryBB = BasicBlock::Create(Entry->getContext(), "", + Entry->getParent(), Entry); + BranchInst::Create(Entry, NEntryBB); + break; + } + } -private: - StringRef makeName(ArrayRef<BCECmpBlock> Comparisons) { - assert(!Comparisons.empty() && "no basic block"); - // Fast path: only one block, or no names at all. - if (Comparisons.size() == 1) - return Comparisons[0].BB->getName(); - const int size = std::accumulate(Comparisons.begin(), Comparisons.end(), 0, - [](int i, const BCECmpBlock &Cmp) { - return i + Cmp.BB->getName().size(); - }); - if (size == 0) - return StringRef("", 0); - - // Slow path: at least two blocks, at least one block with a name. - Scratch.clear(); - // We'll have `size` bytes for name and `Comparisons.size() - 1` bytes for - // separators. - Scratch.reserve(size + Comparisons.size() - 1); - const auto append = [this](StringRef str) { - Scratch.append(str.begin(), str.end()); - }; - append(Comparisons[0].BB->getName()); - for (int I = 1, E = Comparisons.size(); I < E; ++I) { - const BasicBlock *const BB = Comparisons[I].BB; - if (!BB->getName().empty()) { - append("+"); - append(BB->getName()); - } + // Point the predecessors of the chain to the first comparison block (which is + // the new entry point) and update the entry block of the chain. + if (EntryBlock_ != Comparisons_[0].BB) { + EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB); + EntryBlock_ = Comparisons_[0].BB; + } + + // Effectively merge blocks. + int NumMerged = 1; + for (size_t I = 1; I < Comparisons_.size(); ++I) { + if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) { + ++NumMerged; + } else { + // Merge all previous comparisons and start a new merge block. + mergeComparisons( + makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged), + Comparisons_[I].BB, Phi_, TLI, AA); + NumMerged = 1; } - return StringRef(Scratch); } -}; -} // namespace - -// Merges the given contiguous comparison blocks into one memcmp block. -static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, - BasicBlock *const NextCmpBlock, - PHINode &Phi, - const TargetLibraryInfo *const TLI, - AliasAnalysis *AA) { - assert(!Comparisons.empty() && "merging zero comparisons"); - LLVMContext &Context = NextCmpBlock->getContext(); - const BCECmpBlock &FirstCmp = Comparisons[0]; - - // Create a new cmp block before next cmp block. - BasicBlock *const BB = - BasicBlock::Create(Context, MergedBlockName(Comparisons).Name, - NextCmpBlock->getParent(), NextCmpBlock); - IRBuilder<> Builder(BB); - // Add the GEPs from the first BCECmpBlock. - Value *const Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone()); - Value *const Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone()); - - Value *IsEqual = nullptr; - if (Comparisons.size() == 1) { - LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n"); - Value *const LhsLoad = - Builder.CreateLoad(FirstCmp.Lhs().LoadI->getType(), Lhs); - Value *const RhsLoad = - Builder.CreateLoad(FirstCmp.Rhs().LoadI->getType(), Rhs); - // There are no blocks to merge, just do the comparison.
- IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad); - } else { - LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n"); + mergeComparisons(makeArrayRef(Comparisons_) + .slice(Comparisons_.size() - NumMerged, NumMerged), + nullptr, Phi_, TLI, AA); + return true; +} + +void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons, + BasicBlock *const NextBBInChain, + PHINode &Phi, + const TargetLibraryInfo *const TLI, + AliasAnalysis *AA) { + assert(!Comparisons.empty()); + const auto &FirstComparison = *Comparisons.begin(); + BasicBlock *const BB = FirstComparison.BB; + LLVMContext &Context = BB->getContext(); + + if (Comparisons.size() >= 2) { // If there is one block that requires splitting, we do it now, i.e. // just before we know we will collapse the chain. The instructions // can be executed before any of the instructions in the chain. - const auto ToSplit = - std::find_if(Comparisons.begin(), Comparisons.end(), - [](const BCECmpBlock &B) { return B.RequireSplit; }); - if (ToSplit != Comparisons.end()) { - LLVM_DEBUG(dbgs() << "Splitting non_BCE work to header\n"); - ToSplit->split(BB, AA); - } + auto C = std::find_if(Comparisons.begin(), Comparisons.end(), + [](const BCECmpBlock &B) { return B.RequireSplit; }); + if (C != Comparisons.end()) + C->split(EntryBlock_, AA); - const unsigned TotalSizeBits = std::accumulate( - Comparisons.begin(), Comparisons.end(), 0u, - [](int Size, const BCECmpBlock &C) { return Size + C.SizeBits(); }); - - // Create memcmp() == 0. + LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n"); + const auto TotalSize = + std::accumulate(Comparisons.begin(), Comparisons.end(), 0, + [](int Size, const BCECmpBlock &C) { + return Size + C.SizeBits(); + }) / + 8; + + // Incoming edges do not need to be updated, and both GEPs are already + // computing the right address, we just need to: + // - replace the two loads and the icmp with the memcmp + // - update the branch + // - update the incoming values in the phi. + FirstComparison.BranchI->eraseFromParent(); + FirstComparison.CmpI->eraseFromParent(); + FirstComparison.Lhs().LoadI->eraseFromParent(); + FirstComparison.Rhs().LoadI->eraseFromParent(); + + IRBuilder<> Builder(BB); const auto &DL = Phi.getModule()->getDataLayout(); Value *const MemCmpCall = emitMemCmp( - Lhs, Rhs, - ConstantInt::get(DL.getIntPtrType(Context), TotalSizeBits / 8), Builder, - DL, TLI); - IsEqual = Builder.CreateICmpEQ( + FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP, + ConstantInt::get(DL.getIntPtrType(Context), TotalSize), + Builder, DL, TLI); + Value *const MemCmpIsZero = Builder.CreateICmpEQ( MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0)); - } - - BasicBlock *const PhiBB = Phi.getParent(); - // Add a branch to the next basic block in the chain. - if (NextCmpBlock == PhiBB) { - // Continue to phi, passing it the comparison result. - Builder.CreateBr(Phi.getParent()); - Phi.addIncoming(IsEqual, BB); - } else { - // Continue to next block if equal, exit to phi else. - Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB); - Phi.addIncoming(ConstantInt::getFalse(Context), BB); - } - return BB; -} -bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI, - AliasAnalysis *AA) { - assert(Comparisons_.size() >= 2 && "simplifying trivial BCECmpChain"); - // First pass to check if there is at least one merge. If not, we don't do - // anything and we keep analysis passes intact.
- const auto AtLeastOneMerged = [this]() { - for (size_t I = 1; I < Comparisons_.size(); ++I) { - if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) - return true; + // Add a branch to the next basic block in the chain. + if (NextBBInChain) { + Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent()); + Phi.addIncoming(ConstantInt::getFalse(Context), BB); + } else { + Builder.CreateBr(Phi.getParent()); + Phi.addIncoming(MemCmpIsZero, BB); } - return false; - }; - if (!AtLeastOneMerged()) - return false; - // Effectively merge blocks. We go in the reverse direction from the phi block - // so that the next block is always available to branch to. - const auto mergeRange = [this, TLI, AA](int I, int Num, BasicBlock *Next) { - return mergeComparisons(makeArrayRef(Comparisons_).slice(I, Num), Next, - Phi_, TLI, AA); - }; - int NumMerged = 1; - BasicBlock *NextCmpBlock = Phi_.getParent(); - for (int I = static_cast<int>(Comparisons_.size()) - 2; I >= 0; --I) { - if (IsContiguous(Comparisons_[I], Comparisons_[I + 1])) { - ++NumMerged; + // Delete merged blocks. + for (size_t I = 1; I < Comparisons.size(); ++I) { + BasicBlock *CBB = Comparisons[I].BB; + CBB->replaceAllUsesWith(BB); + CBB->eraseFromParent(); + } + } else { + assert(Comparisons.size() == 1); + // There are no blocks to merge, but we still need to update the branches. + LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n"); + if (NextBBInChain) { + if (FirstComparison.BranchI->isConditional()) { + LLVM_DEBUG(dbgs() << "conditional -> conditional\n"); + // Just update the "true" target, the "false" target should already be + // the phi block. + assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent()); + FirstComparison.BranchI->setSuccessor(0, NextBBInChain); + Phi.addIncoming(ConstantInt::getFalse(Context), BB); + } else { + LLVM_DEBUG(dbgs() << "unconditional -> conditional\n"); + // Replace the unconditional branch by a conditional one. + FirstComparison.BranchI->eraseFromParent(); + IRBuilder<> Builder(BB); + Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain, + Phi.getParent()); + Phi.addIncoming(FirstComparison.CmpI, BB); + } } else { - NextCmpBlock = mergeRange(I + 1, NumMerged, NextCmpBlock); - NumMerged = 1; + if (FirstComparison.BranchI->isConditional()) { + LLVM_DEBUG(dbgs() << "conditional -> unconditional\n"); + // Replace the conditional branch by an unconditional one. + FirstComparison.BranchI->eraseFromParent(); + IRBuilder<> Builder(BB); + Builder.CreateBr(Phi.getParent()); + Phi.addIncoming(FirstComparison.CmpI, BB); + } else { + LLVM_DEBUG(dbgs() << "unconditional -> unconditional\n"); + Phi.addIncoming(FirstComparison.CmpI, BB); + } } } - NextCmpBlock = mergeRange(0, NumMerged, NextCmpBlock); - - // Replace the original cmp chain with the new cmp chain by pointing all - // predecessors of EntryBlock_ to NextCmpBlock instead. This makes all cmp - // blocks in the old chain unreachable. - for (BasicBlock *Pred : predecessors(EntryBlock_)) { - Pred->getTerminator()->replaceUsesOfWith(EntryBlock_, NextCmpBlock); - } - EntryBlock_ = nullptr; - - // Delete merged blocks. This also removes incoming values in phi.
- SmallVector<BasicBlock *, 16> DeadBlocks; - for (auto &Cmp : Comparisons_) { - DeadBlocks.push_back(Cmp.BB); - } - DeleteDeadBlocks(DeadBlocks); - - Comparisons_.clear(); - return true; } std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi, diff --git a/test/CodeGen/PowerPC/memcmp-mergeexpand.ll b/test/CodeGen/PowerPC/memcmp-mergeexpand.ll index 298ce90b74ee..c1e81074c863 100644 --- a/test/CodeGen/PowerPC/memcmp-mergeexpand.ll +++ b/test/CodeGen/PowerPC/memcmp-mergeexpand.ll @@ -7,7 +7,7 @@ define zeroext i1 @opeq1( ; PPC64LE-LABEL: opeq1: -; PPC64LE: # %bb.0: # %"entry+land.rhs.i" +; PPC64LE: # %bb.0: # %entry ; PPC64LE-NEXT: ld 3, 0(3) ; PPC64LE-NEXT: ld 4, 0(4) ; PPC64LE-NEXT: xor 3, 3, 4 diff --git a/test/CodeGen/X86/memcmp-mergeexpand.ll b/test/CodeGen/X86/memcmp-mergeexpand.ll index 0be463daaeb0..785ba403465e 100644 --- a/test/CodeGen/X86/memcmp-mergeexpand.ll +++ b/test/CodeGen/X86/memcmp-mergeexpand.ll @@ -8,7 +8,7 @@ define zeroext i1 @opeq1( ; X86-LABEL: opeq1: -; X86: # %bb.0: # %"entry+land.rhs.i" +; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx @@ -20,7 +20,7 @@ define zeroext i1 @opeq1( ; X86-NEXT: retl ; ; X64-LABEL: opeq1: -; X64: # %bb.0: # %"entry+land.rhs.i" +; X64: # %bb.0: # %entry ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: cmpq (%rsi), %rax ; X64-NEXT: sete %al diff --git a/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll b/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll index 00c70fba9c97..fa4af66e6392 100644 --- a/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll +++ b/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll @@ -5,18 +5,19 @@ define zeroext i1 @opeq1( ; X86-LABEL: @opeq1( -; X86-NEXT: "entry+land.rhs.i+land.rhs.i.2+land.rhs.i.3": +; X86-NEXT: entry: ; X86-NEXT: [[PTR:%.*]] = alloca i32 ; X86-NEXT: store i32 42, i32* [[PTR]] -; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* +; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8* ; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16) -; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] ; X86: opeq1.exit: -; X86-NEXT: ret i1 [[TMP2]] +; X86-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[ENTRY:%.*]] ] +; X86-NEXT: ret i1 [[TMP1]] ; %S* nocapture readonly dereferenceable(16) %a, %S* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { diff --git a/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll b/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll index 2123b7969c30..f416fa451eba 100644 --- a/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll +++ b/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll @@ -3,37 +3,37 @@ %S = type { i32, i32, i32, i32 } -; The entry block is part of the chain. It however can not be merged. We need to -; make sure that the control flow is still consistent (goes through each of the -; blocks). +; The entry block is part of the chain. It however can not be merged.
We need to make the +; first comparison block in the chain the new entry block of the function. define zeroext i1 @opeq1( ; CHECK-LABEL: @opeq1( -; CHECK-NEXT: "land.rhs.i+land.rhs.i.2": -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; CHECK-NEXT: [[CSTR3:%.*]] = bitcast i32* [[TMP1]] to i8* -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR3]], i64 8) -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[ENTRY2:%.*]], label [[OPEQ1_EXIT:%.*]] -; CHECK: entry2: -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]] -; CHECK: land.rhs.i.31: -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP10]], [[TMP11]] +; CHECK-NEXT: br label [[LAND_RHS_I:%.*]] +; CHECK: entry: +; CHECK-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[FIRST_I]], align 4 +; CHECK-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[FIRST1_I]], align 4 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: br i1 [[CMP_I]], label [[LAND_RHS_I_3:%.*]], label [[OPEQ1_EXIT:%.*]] +; CHECK: land.rhs.i: +; CHECK-NEXT: [[SECOND_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 0 +; CHECK-NEXT: [[SECOND2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 0 +; CHECK-NEXT: [[CSTR:%.*]] = bitcast i32* [[SECOND_I]] to i8* +; CHECK-NEXT: [[CSTR1:%.*]] = bitcast i32* [[SECOND2_I]] to i8* +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[ENTRY:%.*]], label [[OPEQ1_EXIT]] +; CHECK: land.rhs.i.3: +; CHECK-NEXT: [[FOURTH_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[FOURTH_I]], align 4 +; CHECK-NEXT: [[FOURTH2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[FOURTH2_I]], align 4 +; CHECK-NEXT: [[CMP5_I:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] ; CHECK-NEXT: br label [[OPEQ1_EXIT]] ; CHECK: opeq1.exit: -; CHECK-NEXT: [[TMP13:%.*]] = phi i1 [ [[TMP12]], [[LAND_RHS_I_31]] ], [ false, [[ENTRY2]] ], [ false, %"land.rhs.i+land.rhs.i.2" ] -; CHECK-NEXT: ret i1 [[TMP13]] +; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ false, [[LAND_RHS_I]] ], [ false, [[ENTRY]] ], [ [[CMP5_I]], [[LAND_RHS_I_3]] ] +; CHECK-NEXT: ret i1 [[TMP6]] ; %S* nocapture readonly dereferenceable(16) %a, %S* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { diff --git a/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll 
b/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll index 0a75d3bdd01a..790c0e9854d4 100644 --- a/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll +++ b/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll @@ -23,18 +23,18 @@ define zeroext i1 @opeq1( ; X86-NEXT: [[TMP3:%.*]] = load i32, i32* [[SECOND2_I]], align 4 ; X86-NEXT: call void (...) @foo() ; X86-NEXT: [[CMP2_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] -; X86-NEXT: br i1 [[CMP2_I]], label %"land.rhs.i.2+land.rhs.i.3", label [[OPEQ1_EXIT]] -; X86: "land.rhs.i.2+land.rhs.i.3": -; X86-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 2 -; X86-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP4]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP5]] to i8* +; X86-NEXT: br i1 [[CMP2_I]], label [[LAND_RHS_I_2:%.*]], label [[OPEQ1_EXIT]] +; X86: land.rhs.i.2: +; X86-NEXT: [[THIRD_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 2 +; X86-NEXT: [[THIRD2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[THIRD_I]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[THIRD2_I]] to i8* ; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; X86-NEXT: [[TMP6:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: [[TMP4:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; X86-NEXT: br label [[OPEQ1_EXIT]] ; X86: opeq1.exit: -; X86-NEXT: [[TMP7:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ false, [[LAND_RHS_I]] ], [ [[TMP6]], %"land.rhs.i.2+land.rhs.i.3" ] -; X86-NEXT: ret i1 [[TMP7]] +; X86-NEXT: [[TMP5:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ false, [[LAND_RHS_I]] ], [ [[TMP4]], [[LAND_RHS_I_2]] ] +; X86-NEXT: ret i1 [[TMP5]] ; %S* nocapture readonly dereferenceable(16) %a, %S* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { diff --git a/test/Transforms/MergeICmps/X86/pair-int32-int32.ll b/test/Transforms/MergeICmps/X86/pair-int32-int32.ll index 0a6a681e9d97..13f2f4874d8d 100644 --- a/test/Transforms/MergeICmps/X86/pair-int32-int32.ll +++ b/test/Transforms/MergeICmps/X86/pair-int32-int32.ll @@ -6,16 +6,17 @@ define zeroext i1 @opeq1( ; X86-LABEL: @opeq1( -; X86-NEXT: "entry+land.rhs.i": -; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* +; X86-NEXT: entry: +; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8* ; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] ; X86: opeq1.exit: -; X86-NEXT: ret i1 [[TMP2]] +; X86-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[ENTRY:%.*]] ] +; X86-NEXT: ret i1 [[TMP1]] ; ; X86-NOBUILTIN-LABEL: @opeq1( ; X86-NOBUILTIN-NEXT: entry: @@ -66,15 +67,17 @@ opeq1.exit: ; Same as above, but the two blocks are in inverse order. 
define zeroext i1 @opeq1_inverse( ; X86-LABEL: @opeq1_inverse( -; X86-NEXT: "land.rhs.i+entry": -; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* +; X86-NEXT: br label [[LAND_RHS_I:%.*]] +; X86: land.rhs.i: +; X86-NEXT: [[SECOND_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[SECOND2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[SECOND_I]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[SECOND2_I]] to i8* ; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: [[TMP1:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] ; X86: opeq1.exit: +; X86-NEXT: [[TMP2:%.*]] = phi i1 [ [[TMP1]], [[LAND_RHS_I]] ] ; X86-NEXT: ret i1 [[TMP2]] ; ; X86-NOBUILTIN-LABEL: @opeq1_inverse( diff --git a/test/Transforms/MergeICmps/X86/split-block-does-work.ll b/test/Transforms/MergeICmps/X86/split-block-does-work.ll index 63283edd0ca2..91ef9b1622cd 100644 --- a/test/Transforms/MergeICmps/X86/split-block-does-work.ll +++ b/test/Transforms/MergeICmps/X86/split-block-does-work.ll @@ -8,17 +8,18 @@ declare void @foo(...) nounwind readnone ; We can split %entry and create a memcmp(16 bytes). define zeroext i1 @opeq1( ; X86-LABEL: @opeq1( -; X86-NEXT: "entry+land.rhs.i+land.rhs.i.2+land.rhs.i.3": +; X86-NEXT: entry: ; X86-NEXT: call void (...) @foo() -; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* +; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8* +; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8* ; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16) -; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 ; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] ; X86: opeq1.exit: -; X86-NEXT: ret i1 [[TMP2]] +; X86-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[ENTRY:%.*]] ] +; X86-NEXT: ret i1 [[TMP1]] ; ; Make sure this call is moved to the beginning of the entry block. 
%S* nocapture readonly dereferenceable(16) %a, From 917249a6e65c18c6638e4164d661faefb3c60908 Mon Sep 17 00:00:00 2001 From: Cameron McInally Date: Wed, 15 May 2019 14:31:33 +0000 Subject: [PATCH 16/19] Teach InstSimplify -X + X --> 0.0 about unary FNeg Differential Revision: https://reviews.llvm.org/D61916 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360777 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/InstructionSimplify.cpp | 14 ++++++++---- test/Transforms/InstSimplify/fast-math.ll | 26 +++++++++++++++++++---- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index febc1d5fe9be..770fdf9054c1 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -4316,16 +4316,22 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, (FMF.noSignedZeros() || CannotBeNegativeZero(Op0, Q.TLI))) return Op0; - // With nnan: (+/-0.0 - X) + X --> 0.0 (and commuted variant) + // With nnan: -X + X --> 0.0 (and commuted variant) // We don't have to explicitly exclude infinities (ninf): INF + -INF == NaN. // Negative zeros are allowed because we always end up with positive zero: // X = -0.0: (-0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0 // X = -0.0: ( 0.0 - (-0.0)) + (-0.0) == ( 0.0) + (-0.0) == 0.0 // X = 0.0: (-0.0 - ( 0.0)) + ( 0.0) == (-0.0) + ( 0.0) == 0.0 // X = 0.0: ( 0.0 - ( 0.0)) + ( 0.0) == ( 0.0) + ( 0.0) == 0.0 - if (FMF.noNaNs() && (match(Op0, m_FSub(m_AnyZeroFP(), m_Specific(Op1))) || - match(Op1, m_FSub(m_AnyZeroFP(), m_Specific(Op0))))) - return ConstantFP::getNullValue(Op0->getType()); + if (FMF.noNaNs()) { + if (match(Op0, m_FSub(m_AnyZeroFP(), m_Specific(Op1))) || + match(Op1, m_FSub(m_AnyZeroFP(), m_Specific(Op0)))) + return ConstantFP::getNullValue(Op0->getType()); + + if (match(Op0, m_FNeg(m_Specific(Op1))) || + match(Op1, m_FNeg(m_Specific(Op0)))) + return ConstantFP::getNullValue(Op0->getType()); + } // (X - Y) + Y --> X // Y + (X - Y) --> X diff --git a/test/Transforms/InstSimplify/fast-math.ll b/test/Transforms/InstSimplify/fast-math.ll index 08fb6112e57a..5f981ed125ed 100644 --- a/test/Transforms/InstSimplify/fast-math.ll +++ b/test/Transforms/InstSimplify/fast-math.ll @@ -56,8 +56,8 @@ define float @no_mul_zero_3(float %a) { ; -X + X --> 0.0 (with nnan on the fadd) -define float @fadd_fnegx(float %x) { -; CHECK-LABEL: @fadd_fnegx( +define float @fadd_binary_fnegx(float %x) { +; CHECK-LABEL: @fadd_binary_fnegx( ; CHECK-NEXT: ret float 0.000000e+00 ; %negx = fsub float -0.0, %x %r = fadd nnan float %negx, %x ret float %r } +define float @fadd_unary_fnegx(float %x) { +; CHECK-LABEL: @fadd_unary_fnegx( +; CHECK-NEXT: ret float 0.000000e+00 +; + %negx = fneg float %x + %r = fadd nnan float %negx, %x + ret float %r +} + ; X + -X --> 0.0 (with nnan on the fadd) -define <2 x float> @fadd_fnegx_commute_vec(<2 x float> %x) { -; CHECK-LABEL: @fadd_fnegx_commute_vec( +define <2 x float> @fadd_binary_fnegx_commute_vec(<2 x float> %x) { +; CHECK-LABEL: @fadd_binary_fnegx_commute_vec( ; CHECK-NEXT: ret <2 x float> zeroinitializer ; %negx = fsub <2 x float> <float -0.0, float -0.0>, %x %r = fadd nnan <2 x float> %x, %negx ret <2 x float> %r } +define <2 x float> @fadd_unary_fnegx_commute_vec(<2 x float> %x) { +; CHECK-LABEL: @fadd_unary_fnegx_commute_vec( +; CHECK-NEXT: ret <2 x float> zeroinitializer +; + %negx = fneg <2 x float> %x + %r = fadd nnan <2 x float> %x, %negx + ret <2 x
float> %r +} + define <2 x float> @fadd_fnegx_commute_vec_undef(<2 x float> %x) { ; CHECK-LABEL: @fadd_fnegx_commute_vec_undef( ; CHECK-NEXT: ret <2 x float> zeroinitializer From 2b8edd42a8e62d5557adf6bb27bbc463b20df66b Mon Sep 17 00:00:00 2001 From: Ryan Taylor Date: Wed, 15 May 2019 14:43:55 +0000 Subject: [PATCH 17/19] [AMDGPU] Increases available SGPR for Calling Convention Summary: SGPRs in the CC can be either hw-initialized or set by other chained shaders, so this increases the SGPR count available to the CC to 105. Change-Id: I3dfadc750fe4a3e2bd07117a2899fd13f3e2fef3 Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D61261 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360778 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUCallingConv.td | 22 +- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +- test/CodeGen/AMDGPU/sgpr-limit.ll | 265 +++++++++++++++++++++++ 3 files changed, 287 insertions(+), 4 deletions(-) create mode 100644 test/CodeGen/AMDGPU/sgpr-limit.ll diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index deb2bd8fbdbc..8389058e3f73 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -23,7 +23,16 @@ def CC_SI : CallingConv<[ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, - SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, + SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, + SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, + SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, + SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, + SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, + SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, + SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, + SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, + SGPR104, SGPR105 ]>>>, // We have no way of referring to the generated register tuples @@ -59,7 +68,16 @@ def RetCC_SI_Shader : CallingConv<[ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, - SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39, + SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47, + SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55, + SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63, + SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71, + SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79, + SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87, + SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95, + SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103, + SGPR104, SGPR105 ]>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 409fbfa22f38..1f813ef412e5 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -63,9 +63,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, case MVT::v2f32: case MVT::v4i16: case MVT::v4f16: { - // Up to SGPR0-SGPR39 + // Up to SGPR0-SGPR105 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, - &AMDGPU::SGPR_64RegClass, 20); + &AMDGPU::SGPR_64RegClass, 53); } default: return false; diff --git a/test/CodeGen/AMDGPU/sgpr-limit.ll b/test/CodeGen/AMDGPU/sgpr-limit.ll new file mode 100644 index 000000000000..364cfd880db7 --- /dev/null +++ b/test/CodeGen/AMDGPU/sgpr-limit.ll @@ -0,0 +1,265 @@ +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s + +; CHECK: s_add_i32 s0, s0, s1 +; CHECK: s_add_i32 s1, s0, s2 +; CHECK: s_add_i32 s2, s1, s3 +; CHECK: s_add_i32 s3, s2, s4 +; CHECK: s_add_i32 s4, s3, s5 +; CHECK: s_add_i32 s5, s4, s6 +; CHECK: s_add_i32 s6, s5, s7 +; CHECK: s_add_i32 s7, s6, s8 +; CHECK: s_add_i32 s8, s7, s9 +; CHECK: s_add_i32 s9, s8, s10 +; CHECK: s_add_i32 s10, s9, s11 +; CHECK: s_add_i32 s11, s10, s12 +; CHECK: s_add_i32 s12, s11, s13 +; CHECK: s_add_i32 s13, s12, s14 +; CHECK: s_add_i32 s14, s13, s15 +; CHECK: s_add_i32 s15, s14, s16 +; CHECK: s_add_i32 s16, s15, s17 +; CHECK: s_add_i32 s17, s16, s18 +; CHECK: s_add_i32 s18, s17, s19 +; CHECK: s_add_i32 s19, s18, s20 +; CHECK: s_add_i32 s20, s19, s21 +; CHECK: s_add_i32 s21, s20, s22 +; CHECK: s_add_i32 s22, s21, s23 +; CHECK: s_add_i32 s23, s22, s24 +; CHECK: s_add_i32 s24, s23, s25 +; CHECK: s_add_i32 s25, s24, s26 +; CHECK: s_add_i32 s26, s25, s27 +; CHECK: s_add_i32 s27, s26, s28 +; CHECK: s_add_i32 s28, s27, s29 +; CHECK: s_add_i32 s29, s28, s30 +; CHECK: s_add_i32 s30, s29, s31 +; CHECK: s_add_i32 s31, s30, s32 +; CHECK: s_add_i32 s32, s31, s33 +; CHECK: s_add_i32 s33, s32, s34 +; CHECK: s_add_i32 s34, s33, s35 +; CHECK: s_add_i32 s35, s34, s36 +; CHECK: s_add_i32 s36, s35, s37 +; CHECK: s_add_i32 s37, s36, s38 +; CHECK: s_add_i32 s38, s37, s39 +; CHECK: s_add_i32 s39, s38, s40 +; CHECK: s_add_i32 s40, s39, s41 +; CHECK: s_add_i32 s41, s40, s42 +; CHECK: s_add_i32 s42, s41, s43 +; CHECK: s_add_i32 s43, s42, s44 +; CHECK: s_add_i32 s44, s43, s45 +; CHECK: s_add_i32 s45, s44, s46 +; CHECK: s_add_i32 s46, s45, s47 +; CHECK: s_add_i32 s47, s46, s48 +; CHECK: s_add_i32 s48, s47, s49 +; CHECK: s_add_i32 s49, s48, s50 +; CHECK: s_add_i32 s50, s49, s51 +; CHECK: s_add_i32 s51, s50, s52 +; CHECK: s_add_i32 s52, s51, s53 +; CHECK: s_add_i32 s53, s52, s54 +; CHECK: s_add_i32 s54, s53, s55 +; CHECK: s_add_i32 s55, s54, s56 +; CHECK: s_add_i32 s56, s55, s57 +; CHECK: s_add_i32 s57, s56, s58 +; CHECK: s_add_i32 s58, s57, s59 +; CHECK: s_add_i32 s59, s58, s60 +; CHECK: s_add_i32 s60, s59, s61 +; CHECK: s_add_i32 s61, s60, s62 +; CHECK: s_add_i32 s62, s61, s63 +define amdgpu_gs { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } @_amdgpu_gs_sgpr_limit_i32 (i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 
inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, <4 x i32> inreg) { +.entry: + %65 = add i32 %0, %1 + %66 = add i32 %65, %2 + %67 = add i32 %66, %3 + %68 = add i32 %67, %4 + %69 = add i32 %68, %5 + %70 = add i32 %69, %6 + %71 = add i32 %70, %7 + %72 = add i32 %71, %8 + %73 = add i32 %72, %9 + %74 = add i32 %73, %10 + %75 = add i32 %74, %11 + %76 = add i32 %75, %12 + %77 = add i32 %76, %13 + %78 = add i32 %77, %14 + %79 = add i32 %78, %15 + %80 = add i32 %79, %16 + %81 = add i32 %80, %17 + %82 = add i32 %81, %18 + %83 = add i32 %82, %19 + %84 = add i32 %83, %20 + %85 = add i32 %84, %21 + %86 = add i32 %85, %22 + %87 = add i32 %86, %23 + %88 = add i32 %87, %24 + %89 = add i32 %88, %25 + %90 = add i32 %89, %26 + %91 = add i32 %90, %27 + %92 = add i32 %91, %28 + %93 = add i32 %92, %29 + %94 = add i32 %93, %30 + %95 = add i32 %94, %31 + %96 = add i32 %95, %32 + %97 = add i32 %96, %33 + %98 = add i32 %97, %34 + %99 = add i32 %98, %35 + %100 = add i32 %99, %36 + %101 = add i32 %100, %37 + %102 = add i32 %101, %38 + %103 = add i32 %102, %39 + %104 = add i32 %103, %40 + %105 = add i32 %104, %41 + %106 = add i32 %105, %42 + %107 = add i32 %106, %43 + %108 = add i32 %107, %44 + %109 = add i32 %108, %45 + %110 = add i32 %109, %46 + %111 = add i32 %110, %47 + %112 = add i32 %111, %48 + %113 = add i32 %112, %49 + %114 = add i32 %113, %50 + %115 = add i32 %114, %51 + %116 = add i32 %115, %52 + %117 = add i32 %116, %53 + %118 = add i32 %117, %54 + %119 = add i32 %118, %55 + %120 = add i32 %119, %56 + %121 = add i32 %120, %57 + %122 = add i32 %121, %58 + %123 = add i32 %122, %59 + %124 = add i32 %123, %60 + %125 = add i32 %124, %61 + %126 = add i32 %125, %62 + %127 = add i32 %126, %63 +%128 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } undef, i32 %65, 0 +%129 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %128, i32 %66, 1 +%130 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %129, i32 %67, 2 +%131 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %130, i32 %68, 3 +%132 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %131, i32 %69, 4 +%133 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %132, i32 %70, 5 +%134 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %133, i32 %71, 6 +%135 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %134, i32 %72, 7 +%136 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %135, i32 %73, 8 +%137 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %136, i32 %74, 9 +%138 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %137, i32 %75, 10 +%139 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %138, i32 %76, 11 +%140 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %139, i32 %77, 12 +%141 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %140, i32 %78, 13 +%142 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %141, i32 %79, 14 +%143 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %142, i32 %80, 15 +%144 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %143, i32 %81, 16 +%145 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %144, i32 %82, 17 +%146 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %145, i32 %83, 18 +%147 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %146, i32 %84, 19 +%148 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %147, i32 %85, 20 +%149 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %148, i32 %86, 21 +%150 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %149, i32 %87, 22 +%151 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %150, i32 %88, 23 +%152 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %151, i32 %89, 24 +%153 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %152, i32 %90, 25 +%154 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %153, i32 %91, 26 +%155 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %154, i32 %92, 27 +%156 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %155, i32 %93, 28 +%157 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %156, i32 %94, 29 +%158 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %157, i32 %95, 30 +%159 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %158, i32 %96, 31 +%160 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %159, i32 %97, 32 +%161 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %160, i32 %98, 33 +%162 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %161, i32 %99, 34 +%163 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %162, i32 %100, 35 +%164 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %163, i32 %101, 36 +%165 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %164, i32 %102, 37 +%166 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %165, i32 %103, 38 +%167 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %166, i32 %104, 39 +%168 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %167, i32 %105, 40 +%169 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %168, i32 %106, 41 +%170 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %169, i32 %107, 42 +%171 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %170, i32 %108, 43
+%172 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %171, i32 %109, 44
+%173 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %172, i32 %110, 45
+%174 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %173, i32 %111, 46
+%175 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %174, i32 %112, 47
+%176 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %175, i32 %113, 48
+%177 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %176, i32 %114, 49
+%178 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %177, i32 %115, 50
+%179 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %178, i32 %116, 51
+%180 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %179, i32 %117, 52
+%181 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %180, i32 %118, 53
+%182 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %181, i32 %119, 54
+%183 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %182, i32 %120, 55
+%184 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %183, i32 %121, 56
+%185 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %184, i32 %122, 57
+%186 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %185, i32 %123, 58
+%187 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %186, i32 %124, 59
+%188 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %187, i32 %125, 60
+%189 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %188, i32 %126, 61
+%190 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %189, i32 %127, 62
+ ret { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %190
+}
+
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[6:7]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[8:9]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[10:11]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[12:13]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[14:15]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[16:17]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[18:19]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[20:21]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[22:23]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[24:25]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[26:27]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[28:29]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[30:31]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[32:33]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[34:35]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[36:37]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[38:39]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[40:41]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[42:43]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[44:45]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[46:47]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[48:49]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[50:51]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[52:53]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[54:55]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[56:57]
+; CHECK: s_xor_b64 s[0:1], s[0:1], s[58:59]
+define amdgpu_gs void @_amdgpu_gs_sgpr_limit_i64 (i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, <4 x i32> inreg %addr) {
+.entry:
+ %31 = xor i64 %0, %1
+ %32 = xor i64 %31, %2
+ %33 = xor i64 %32, %3
+ %34 = xor i64 %33, %4
+ %35 = xor i64 %34, %5
+ %36 = xor i64 %35, %6
+ %37 = xor i64 %36, %7
+ %38 = xor i64 %37, %8
+ %39 = xor i64 %38, %9
+ %40 = xor i64 %39, %10
+ %41 = xor i64 %40, %11
+ %42 = xor i64 %41, %12
+ %43 = xor i64 %42, %13
+ %44 = xor i64 %43, %14
+ %45 = xor i64 %44, %15
+ %46 = xor i64 %45, %16
+ %47 = xor i64 %46, %17
+ %48 = xor i64 %47, %18
+ %49 = xor i64 %48, %19
+ %50 = xor i64 %49, %20
+ %51 = xor i64 %50, %21
+ %52 = xor i64 %51, %22
+ %53 = xor i64 %52, %23
+ %54 = xor i64 %53, %24
+ %55 = xor i64 %54, %25
+ %56 = xor i64 %55, %26
+ %57 = xor i64 %56, %27
+ %58 = xor i64 %57, %28
+ %59 = xor i64 %58, %29
+ %60 = bitcast i64 %59 to <2 x i32>
+ call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %60, <4 x i32> %addr, i32 4, i32 0, i32 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32)
+

From 1149c3500bfa3a386a3d45f0913a1c827e21be48 Mon Sep 17 00:00:00 2001
From: Hiroshi Yamauchi
Date: Wed, 15 May 2019 15:15:16 +0000
Subject: [PATCH 18/19] [JumpThreading] A bug fix for stale loop info after
 unfold select

Summary:
The return value of a TryToUnfoldSelect call was not checked, which led to
incorrectly preserved loop info and a crash.

The original crash was reported on https://reviews.llvm.org/D59514.
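
In essence, the bug is the common pattern of dropping a transform's
"changed" result. A minimal C++ sketch of the idea (illustrative only;
the names are hypothetical, not the actual JumpThreading code):

  // Illustrative sketch, not LLVM source.
  struct Block {};

  // May rewrite the CFG; returns true iff it changed anything.
  bool tryToUnfoldSelect(Block &) { return true; }

  bool processBlock(Block &BB) {
    // Before the fix, the call's result was silently dropped, so the
    // caller reported "no change" and cached analyses (such as loop
    // info) were reused in a stale state.
    if (tryToUnfoldSelect(BB)) // after the fix: propagate the change
      return true;             // so dependent analyses get recomputed
    return false;
  }
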
Reviewers: davidxl, amehsan

Reviewed By: davidxl

Subscribers: fhahn, brzycki, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D61920

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360780 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/JumpThreading.cpp            |  3 +-
 .../stale-loop-info-after-unfold-select.ll         | 30 +++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/JumpThreading/stale-loop-info-after-unfold-select.ll

diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 123c8b9630cb..7cb955d03ff4 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1174,7 +1174,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
   }
 
   if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
-    TryToUnfoldSelect(SI, BB);
+    if (TryToUnfoldSelect(SI, BB))
+      return true;
 
   // Check for some cases that are worth simplifying. Right now we want to look
   // for loads that are used by a switch or by the condition for the branch. If
diff --git a/test/Transforms/JumpThreading/stale-loop-info-after-unfold-select.ll b/test/Transforms/JumpThreading/stale-loop-info-after-unfold-select.ll
new file mode 100644
index 000000000000..7dbc794c3639
--- /dev/null
+++ b/test/Transforms/JumpThreading/stale-loop-info-after-unfold-select.ll
@@ -0,0 +1,30 @@
+; RUN: opt -passes='require<loops>,jump-threading,verify<loops>' -S < %s
+
+%"type1" = type { i8 }
+%"type2" = type opaque
+
+define dso_local i16* @func2(%"type1"* %this, %"type2"*) {
+entry:
+ br label %while.cond
+
+while.cond:                              ; preds = %func1.exit, %entry
+ %month.0 = phi i32 [ undef, %entry ], [ %month.0.be, %func1.exit ]
+ switch i32 %month.0, label %if.end.i [
+   i32 4, label %func1.exit
+   i32 1, label %func1.exit
+ ]
+
+if.end.i:                                ; preds = %while.cond
+ br label %func1.exit
+
+func1.exit:                              ; preds = %if.end.i, %while.cond, %while.cond
+ %retval.0.i = phi i32 [ 9, %if.end.i ], [ 0, %while.cond ], [ 0, %while.cond ]
+ %call2 = tail call signext i32 @func3(i32 signext %retval.0.i, i32 signext 1, i32 signext 3)
+ %cmp = icmp slt i32 %call2, 1
+ %add = add nsw i32 %call2, 2
+ %month.0.be = select i1 %cmp, i32 %add, i32 %call2
+ br label %while.cond
+}
+
+declare i32 @func3(i32, i32, i32)
+

From 7c3b5ade5e9237a4e50084597b9cdde5ad5165d8 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme
Date: Wed, 15 May 2019 15:20:45 +0000
Subject: [PATCH 19/19] [FileCheck] Fix sphinx error: Make input be gas block

Summary:
Change example of input text from being llvm block to being gas block
since that text is made-up assembly.

Reviewers: jhenderson, jdenny, probinson, arichardson

Subscribers: llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D61893

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360781 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/CommandGuide/FileCheck.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 55c7f31f63b8..bc216be35df6 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -593,13 +593,13 @@ For example:
 
 The above example would match the line:
 
-.. code-block:: llvm
+.. code-block:: gas
 
    add r5, r5, r6
 
 but would not match the line:
 
-.. code-block:: llvm
+.. code-block:: gas
 
    add r5, r5, r7