Skip to content

Commit

Permalink
JIT: more likelihood checking (dotnet#99541)
Browse files Browse the repository at this point in the history
Check has likelihood and likelihood sum consistency all the way until
rationalize.

Several phases required a bit of revision, notably the multi-guess type
test expansion (similar to GDV) and optimize bools.

Fix carry-over issue from previous round where block and edge scaling
in loop cloning used different factors.

Contributes to dotnet#93020
  • Loading branch information
AndyAyersMS authored and janvorli committed Mar 12, 2024
1 parent 4983d9e commit 58ac7be
Show file tree
Hide file tree
Showing 19 changed files with 375 additions and 125 deletions.
2 changes: 2 additions & 0 deletions src/coreclr/inc/eetwain.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ enum ICodeManagerFlags
// any untracked slots

LightUnwind = 0x0100, // Unwind just enough to get return addresses
ReportFPBasedSlotsOnly = 0x0200,
};

//*****************************************************************************
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/inc/gcinfodecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -679,7 +679,7 @@ class GcInfoDecoder
if(slotIndex < slotDecoder.GetNumRegisters())
{
UINT32 regNum = pSlot->Slot.RegisterNumber;
if( reportScratchSlots || !IsScratchRegister( regNum, pRD ) )
if( (reportScratchSlots || !IsScratchRegister( regNum, pRD )) )
{
ReportRegisterToGC(
regNum,
Expand Down
10 changes: 5 additions & 5 deletions src/coreclr/jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4850,11 +4850,6 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
DoPhase(this, PHASE_COMPUTE_DOMINATORS, &Compiler::fgComputeDominators);
}

// Disable profile checks now.
// Over time we will move this further and further back in the phase list, as we fix issues.
//
activePhaseChecks &= ~PhaseChecks::CHECK_PROFILE;

#ifdef DEBUG
fgDebugCheckLinks();
#endif
Expand Down Expand Up @@ -5135,6 +5130,11 @@ void Compiler::compCompile(void** methodCodePtr, uint32_t* methodCodeSize, JitFl
}
#endif // TARGET_ARM

// Disable profile checks now.
// Over time we will move this further and further back in the phase list, as we fix issues.
//
activePhaseChecks &= ~PhaseChecks::CHECK_PROFILE;

// Assign registers to variables, etc.

// Create LinearScan before Lowering, so that Lowering can call LinearScan methods
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -6807,7 +6807,7 @@ class Compiler
public:
PhaseStatus optOptimizeBools();
PhaseStatus optSwitchRecognition();
bool optSwitchConvert(BasicBlock* firstBlock, int testsCount, ssize_t* testValues, GenTree* nodeToTest);
bool optSwitchConvert(BasicBlock* firstBlock, int testsCount, ssize_t* testValues, weight_t falseLikelihood, GenTree* nodeToTest);
bool optSwitchDetectAndConvert(BasicBlock* firstBlock);

PhaseStatus optInvertLoops(); // Invert loops so they're entered at top and tested at bottom.
Expand Down
10 changes: 6 additions & 4 deletions src/coreclr/jit/fgbasic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5420,13 +5420,15 @@ BasicBlock* Compiler::fgConnectFallThrough(BasicBlock* bSrc, BasicBlock* bDst)
if (bSrc->KindIs(BBJ_COND) && bSrc->FalseTargetIs(bDst) && !bSrc->NextIs(bDst))
{
// Add a new block after bSrc which jumps to 'bDst'
jmpBlk = fgNewBBafter(BBJ_ALWAYS, bSrc, true);
FlowEdge* oldEdge = bSrc->GetFalseEdge();
jmpBlk = fgNewBBafter(BBJ_ALWAYS, bSrc, true);
FlowEdge* const oldEdge = bSrc->GetFalseEdge();
// Access the likelihood of oldEdge before
// it gets reset by SetTargetEdge below.
//
FlowEdge* const newEdge = fgAddRefPred(jmpBlk, bSrc, oldEdge);
fgReplacePred(oldEdge, jmpBlk);
jmpBlk->SetTargetEdge(oldEdge);
assert(jmpBlk->TargetIs(bDst));

FlowEdge* newEdge = fgAddRefPred(jmpBlk, bSrc, oldEdge);
bSrc->SetFalseEdge(newEdge);

// When adding a new jmpBlk we will set the bbWeight and bbFlags
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/flowgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,8 @@ BasicBlock* Compiler::fgCreateGCPoll(GCPollType pollType, BasicBlock* block)
// Bottom has Top and Poll as its predecessors. Poll has just Top as a predecessor.
FlowEdge* const trueEdge = fgAddRefPred(bottom, top);
FlowEdge* const falseEdge = fgAddRefPred(poll, top);
trueEdge->setLikelihood(1.0);
falseEdge->setLikelihood(0.0);

FlowEdge* const newEdge = fgAddRefPred(bottom, poll);
poll->SetTargetEdge(newEdge);
Expand Down
106 changes: 93 additions & 13 deletions src/coreclr/jit/helperexpansion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,8 @@ bool Compiler::fgExpandRuntimeLookupsForCall(BasicBlock** pBlock, Statement* stm
FlowEdge* const falseEdge = fgAddRefPred(nullcheckBb, sizeCheckBb);
sizeCheckBb->SetTrueEdge(trueEdge);
sizeCheckBb->SetFalseEdge(falseEdge);
trueEdge->setLikelihood(0.2);
falseEdge->setLikelihood(0.8);
}

// fallbackBb is reachable from both nullcheckBb and sizeCheckBb
Expand All @@ -414,10 +416,13 @@ bool Compiler::fgExpandRuntimeLookupsForCall(BasicBlock** pBlock, Statement* stm
FlowEdge* const falseEdge = fgAddRefPred(fastPathBb, nullcheckBb);
nullcheckBb->SetTrueEdge(trueEdge);
nullcheckBb->SetFalseEdge(falseEdge);
trueEdge->setLikelihood(0.2);
falseEdge->setLikelihood(0.8);

//
// Re-distribute weights (see '[weight: X]' on the diagrams above)
// TODO: consider marking fallbackBb as rarely-taken
// TODO: derive block weights from edge likelihoods.
//
block->inheritWeight(prevBb);
if (needsSizeCheck)
Expand Down Expand Up @@ -720,6 +725,8 @@ bool Compiler::fgExpandThreadLocalAccessForCallNativeAOT(BasicBlock** pBlock, St
FlowEdge* const falseEdge = fgAddRefPred(fallbackBb, tlsRootNullCondBB);
tlsRootNullCondBB->SetTrueEdge(trueEdge);
tlsRootNullCondBB->SetFalseEdge(falseEdge);
trueEdge->setLikelihood(1.0);
falseEdge->setLikelihood(0.0);

{
FlowEdge* const newEdge = fgAddRefPred(block, fallbackBb);
Expand Down Expand Up @@ -1091,13 +1098,17 @@ bool Compiler::fgExpandThreadLocalAccessForCall(BasicBlock** pBlock, Statement*
FlowEdge* const falseEdge = fgAddRefPred(threadStaticBlockNullCondBB, maxThreadStaticBlocksCondBB);
maxThreadStaticBlocksCondBB->SetTrueEdge(trueEdge);
maxThreadStaticBlocksCondBB->SetFalseEdge(falseEdge);
trueEdge->setLikelihood(0.0);
falseEdge->setLikelihood(1.0);
}

{
FlowEdge* const trueEdge = fgAddRefPred(fastPathBb, threadStaticBlockNullCondBB);
FlowEdge* const falseEdge = fgAddRefPred(fallbackBb, threadStaticBlockNullCondBB);
threadStaticBlockNullCondBB->SetTrueEdge(trueEdge);
threadStaticBlockNullCondBB->SetFalseEdge(falseEdge);
trueEdge->setLikelihood(1.0);
falseEdge->setLikelihood(0.0);
}

{
Expand Down Expand Up @@ -1471,6 +1482,8 @@ bool Compiler::fgExpandStaticInitForCall(BasicBlock** pBlock, Statement* stmt, G
FlowEdge* const falseEdge = fgAddRefPred(helperCallBb, isInitedBb);
isInitedBb->SetTrueEdge(trueEdge);
isInitedBb->SetFalseEdge(falseEdge);
trueEdge->setLikelihood(1.0);
falseEdge->setLikelihood(0.0);
}

//
Expand Down Expand Up @@ -1700,7 +1713,7 @@ bool Compiler::fgVNBasedIntrinsicExpansionForCall_ReadUtf8(BasicBlock** pBlock,
//
// Block 1: lengthCheckBb (we check that dstLen < srcLen)
//
BasicBlock* lengthCheckBb = fgNewBBafter(BBJ_COND, prevBb, true);
BasicBlock* const lengthCheckBb = fgNewBBafter(BBJ_COND, prevBb, true);
lengthCheckBb->SetFlags(BBF_INTERNAL);

// Set bytesWritten -1 by default, if the fast path is not taken we'll return it as the result.
Expand Down Expand Up @@ -1786,6 +1799,10 @@ bool Compiler::fgVNBasedIntrinsicExpansionForCall_ReadUtf8(BasicBlock** pBlock,
FlowEdge* const falseEdge = fgAddRefPred(fastpathBb, lengthCheckBb);
lengthCheckBb->SetTrueEdge(trueEdge);
lengthCheckBb->SetFalseEdge(falseEdge);

// review: we assume length check always succeeds??
trueEdge->setLikelihood(1.0);
falseEdge->setLikelihood(0.0);
}

{
Expand Down Expand Up @@ -2489,46 +2506,107 @@ bool Compiler::fgLateCastExpansionForCall(BasicBlock** pBlock, Statement* stmt,

fgRedirectTargetEdge(firstBb, nullcheckBb);

// We assume obj is 50%/50% null/not-null (TODO-InlineCast: rely on PGO)
// and rely on profile for the slow path.
//
// Alternatively we could profile nulls in the reservoir sample and
// treat that as another "option".
//
// True out of the null check means obj is null.
//
const weight_t nullcheckTrueLikelihood = 0.5;
const weight_t nullcheckFalseLikelihood = 0.5;

{
FlowEdge* const trueEdge = fgAddRefPred(lastBb, nullcheckBb);
nullcheckBb->SetTrueEdge(trueEdge);
trueEdge->setLikelihood(nullcheckTrueLikelihood);
}

if (typeCheckNotNeeded)
{
FlowEdge* const falseEdge = fgAddRefPred(fallbackBb, nullcheckBb);
nullcheckBb->SetFalseEdge(falseEdge);
falseEdge->setLikelihood(nullcheckFalseLikelihood);
}
else
{
FlowEdge* const falseEdge = fgAddRefPred(typeChecksBbs[0], nullcheckBb);
nullcheckBb->SetFalseEdge(falseEdge);
falseEdge->setLikelihood(nullcheckFalseLikelihood);

FlowEdge* const newEdge = fgAddRefPred(lastBb, typeCheckSucceedBb);
typeCheckSucceedBb->SetTargetEdge(newEdge);
}

//
// Re-distribute weights
// Re-distribute weights and set edge likelihoods.
//
// We have the likelihood estimate for each class to test against.
// As with multi-guess GDV, once we've failed the first check, the
// likelihood of the other checks will increase.
//
// For instance, suppose we have 3 classes A, B, C, with likelihoods 0.5, 0.2, 0.1,
// and a residual 0.2 likelihood for none of the above.
//
// We first test for A, this is a 0.5 likelihood of success.
//
// If the test for A fails, we test for B. Because we only reach this second
// test with relative likelihood 0.5, we need to divide (scale up) the remaining
// likelihoods by 0.5, so we end up with 0.4, 0.2, (none of the above: 0.4).
//
// If the test for B also fails, we test for C. Because we only reach
// this second test with relative likelihood 0.6, we need to divide (scale up)
// the remaining likelihoods by 0.6, so we end up with 0.33, (none of the above: 0.67).
//
// In the code below, instead of updating all the likelihoods each time,
// we remember how much we need to divide by to fix the next likelihood. This divisor is
// 1.0 - (sumOfPreviousLikelihoods)
//
// So for the above, we have
//
// Test | Likelihood | Sum of Previous | Rel. Likelihood
// A | 0.5 | 0.0 | 0.5 / (1 - 0.0) = 0.50
// B | 0.2 | 0.5 | 0.2 / (1 - 0.5) = 0.40
// C | 0.1 | 0.7 | 0.1 / (1 - 0.7) = 0.33
// n/a | 0.2 | 0.8 | 0.2 / (1 - 0.8) = 1.00
//
// The same goes for inherited weights -- the block where we test for B will have
// the weight of A times the likelihood that A's test fails, etc.
//
nullcheckBb->inheritWeight(firstBb);
unsigned totalLikelihood = 0;
weight_t sumOfPreviousLikelihood = 0;
for (int candidateId = 0; candidateId < numOfCandidates; candidateId++)
{
unsigned likelihood = likelihoods[candidateId];
BasicBlock* curTypeCheckBb = typeChecksBbs[candidateId];
if (candidateId == 0)
{
// We assume obj is 50%/50% null/not-null (TODO-InlineCast: rely on PGO)
// and rely on profile for the slow path.
curTypeCheckBb->inheritWeightPercentage(nullcheckBb, 50);
// Predecessor is the nullcheck, control reaches on false.
//
curTypeCheckBb->inheritWeight(nullcheckBb);
curTypeCheckBb->scaleBBWeight(nullcheckBb->GetFalseEdge()->getLikelihood());
}
else
{
BasicBlock* prevTypeCheckBb = typeChecksBbs[candidateId - 1];
curTypeCheckBb->inheritWeightPercentage(prevTypeCheckBb, likelihood);
// Predecessor is the prior type check, control reaches on false.
//
BasicBlock* const prevTypeCheckBb = typeChecksBbs[candidateId - 1];
weight_t prevCheckFailedLikelihood = prevTypeCheckBb->GetFalseEdge()->getLikelihood();
curTypeCheckBb->inheritWeight(prevTypeCheckBb);
curTypeCheckBb->scaleBBWeight(prevCheckFailedLikelihood);
}
totalLikelihood += likelihood;

// Fix likelihood of block's outgoing edges.
//
weight_t likelihood = (weight_t)likelihoods[candidateId] / 100;
weight_t relLikelihood = likelihood / (1.0 - sumOfPreviousLikelihood);

JITDUMP("Candidate %d: likelihood " FMT_WT " relative likelihood " FMT_WT "\n", candidateId, likelihood,
relLikelihood);

curTypeCheckBb->GetTrueEdge()->setLikelihood(relLikelihood);
curTypeCheckBb->GetFalseEdge()->setLikelihood(1.0 - relLikelihood);
sumOfPreviousLikelihood += likelihood;
}

if (fallbackBb->KindIs(BBJ_THROW))
Expand All @@ -2540,13 +2618,15 @@ bool Compiler::fgLateCastExpansionForCall(BasicBlock** pBlock, Statement* stmt,
assert(fallbackBb->KindIs(BBJ_ALWAYS));
FlowEdge* const newEdge = fgAddRefPred(lastBb, fallbackBb);
fallbackBb->SetTargetEdge(newEdge);

fallbackBb->inheritWeightPercentage(lastTypeCheckBb, 100 - totalLikelihood);
fallbackBb->inheritWeight(lastTypeCheckBb);
weight_t lastTypeCheckFailedLikelihood = lastTypeCheckBb->GetFalseEdge()->getLikelihood();
fallbackBb->scaleBBWeight(lastTypeCheckFailedLikelihood);
}

if (!typeCheckNotNeeded)
{
typeCheckSucceedBb->inheritWeightPercentage(typeChecksBbs[0], totalLikelihood);
typeCheckSucceedBb->inheritWeight(typeChecksBbs[0]);
typeCheckSucceedBb->scaleBBWeight(sumOfPreviousLikelihood);
}

lastBb->inheritWeight(firstBb);
Expand Down
18 changes: 4 additions & 14 deletions src/coreclr/jit/loopcloning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -859,10 +859,7 @@ BasicBlock* LoopCloneContext::CondToStmtInBlock(Compiler*
// For "normal" cloning this is probably ok. For GDV cloning this
// may be inaccurate. We should key off the type test likelihood(s).
//
// TODO: this is a bit of out sync with what we do for block weights.
// Reconcile.
//
const weight_t fastLikelihood = 0.999;
const weight_t fastLikelihood = fastPathWeightScaleFactor;

// Choose how to generate the conditions
const bool generateOneConditionPerBlock = true;
Expand Down Expand Up @@ -1961,13 +1958,6 @@ void Compiler::optCloneLoop(FlowGraphNaturalLoop* loop, LoopCloneContext* contex
// taking the max with the head block's weight.
ambientWeight = max(ambientWeight, preheader->bbWeight);

// We assume that the fast path will run 99% of the time, and thus should get 99% of the block weights.
// The slow path will, correspondingly, get only 1% of the block weights. It could be argued that we should
// mark the slow path as "run rarely", since it really shouldn't execute (given the currently optimized loop
// conditions) except under exceptional circumstances.
const weight_t fastPathWeightScaleFactor = 0.99;
const weight_t slowPathWeightScaleFactor = 1.0 - fastPathWeightScaleFactor;

// We're going to transform this loop:
//
// preheader --> header
Expand Down Expand Up @@ -2023,18 +2013,18 @@ void Compiler::optCloneLoop(FlowGraphNaturalLoop* loop, LoopCloneContext* contex
BasicBlock* slowPreheader = fgNewBBafter(BBJ_ALWAYS, newPred, /*extendRegion*/ true);
JITDUMP("Adding " FMT_BB " after " FMT_BB "\n", slowPreheader->bbNum, newPred->bbNum);
slowPreheader->bbWeight = newPred->isRunRarely() ? BB_ZERO_WEIGHT : ambientWeight;
slowPreheader->scaleBBWeight(slowPathWeightScaleFactor);
slowPreheader->scaleBBWeight(LoopCloneContext::slowPathWeightScaleFactor);
newPred = slowPreheader;

// Now we'll clone the blocks of the loop body. These cloned blocks will be the slow path.

BlockToBlockMap* blockMap = new (getAllocator(CMK_LoopClone)) BlockToBlockMap(getAllocator(CMK_LoopClone));

loop->Duplicate(&newPred, blockMap, slowPathWeightScaleFactor);
loop->Duplicate(&newPred, blockMap, LoopCloneContext::slowPathWeightScaleFactor);

// Scale old blocks to the fast path weight.
loop->VisitLoopBlocks([=](BasicBlock* block) {
block->scaleBBWeight(fastPathWeightScaleFactor);
block->scaleBBWeight(LoopCloneContext::fastPathWeightScaleFactor);
return BasicBlockVisit::Continue;
});

Expand Down
8 changes: 8 additions & 0 deletions src/coreclr/jit/loopcloning.h
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,14 @@ struct NaturalLoopIterInfo;
*/
struct LoopCloneContext
{
// We assume that the fast path will run 99% of the time, and thus should get 99% of the block weights.
// The slow path will, correspondingly, get only 1% of the block weights. It could be argued that we should
// mark the slow path as "run rarely", since it really shouldn't execute (given the currently optimized loop
// conditions) except under exceptional circumstances.
//
static constexpr weight_t fastPathWeightScaleFactor = 0.99;
static constexpr weight_t slowPathWeightScaleFactor = 1.0 - fastPathWeightScaleFactor;

CompAllocator alloc; // The allocator

// The array of optimization opportunities found in each loop. (loop x optimization-opportunities)
Expand Down
Loading

0 comments on commit 58ac7be

Please sign in to comment.