Revert "Port yield normalization from CoreCLR to Native AOT (#103675)"
This reverts commit d35f302.
MichalStrehovsky committed Jul 26, 2024
1 parent f6b4960 commit 45de37b
Showing 12 changed files with 653 additions and 390 deletions.
6 changes: 6 additions & 0 deletions src/coreclr/gc/env/gcenv.os.h
@@ -6,6 +6,12 @@
#ifndef __GCENV_OS_H__
#define __GCENV_OS_H__

#ifdef HAS_SYSTEM_YIELDPROCESSOR
// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
#undef YieldProcessor
#define YieldProcessor System_YieldProcessor
#endif

#define NUMA_NODE_UNDEFINED UINT16_MAX

bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
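The hunk above is one half of a macro handshake: yieldprocessornormalized.h poisons the raw YieldProcessor name to steer callers toward the normalized wrappers, and GC headers that legitimately need the raw instruction map the name back to System_YieldProcessor. A minimal sketch of the pattern, assuming an x86/x64 target where the system yield is _mm_pause (the stand-in macro and layout are illustrative, not the commit's exact code):

#include <immintrin.h> // _mm_pause (x86/x64 assumption)

// Stand-in for the platform-provided macro.
#define YieldProcessor() _mm_pause()

// Capture a usable alias to the system implementation first...
inline void System_YieldProcessor() { YieldProcessor(); }
#define HAS_SYSTEM_YIELDPROCESSOR

// ...then poison the original name so stray uses fail to compile,
// nudging new code toward the normalized wrappers.
#undef YieldProcessor
#define YieldProcessor Dont_Use_YieldProcessor

// A component allowed to issue raw yields (the GC here) restores it:
#ifdef HAS_SYSTEM_YIELDPROCESSOR
#undef YieldProcessor
#define YieldProcessor System_YieldProcessor
#endif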
39 changes: 21 additions & 18 deletions src/coreclr/inc/yieldprocessornormalized.h
@@ -3,11 +3,14 @@

#pragma once

#ifdef FEATURE_NATIVEAOT
FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
#else
// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
// the intention is to use the system-default implementation of YieldProcessor().
#define HAS_SYSTEM_YIELDPROCESSOR
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#ifdef YieldProcessor
#undef YieldProcessor
#endif
#define YieldProcessor Dont_Use_YieldProcessor

#define DISABLE_COPY(T) \
T(const T &) = delete; \
@@ -141,17 +144,17 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
{
_ASSERTE(count != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (count > MaxCount)
{
count = MaxCount;
}
}

size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
_ASSERTE(n != 0);
do
{
@@ -186,9 +189,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
{
_ASSERTE(preSkylakeCount != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -197,7 +200,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
{
n = 1;
@@ -224,9 +227,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)

_ASSERTE(preSkylakeCount != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -235,8 +238,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
size_t n =
(size_t)preSkylakeCount *
SIZE_T n =
(SIZE_T)preSkylakeCount *
YieldProcessorNormalization::s_yieldsPerNormalizedYield /
PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
@@ -265,11 +268,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
unsigned int spinIteration)
{
// This shift value should be adjusted based on the asserted conditions below
const uint8_t MaxShift = 3;
static_assert(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
static_assert(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
const UINT8 MaxShift = 3;
static_assert_no_msg(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);

unsigned int n;
if (spinIteration <= MaxShift &&
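The normalized helpers in this header share one shape: clamp the requested count so the multiply by the measured scale factor cannot overflow on 32-bit platforms, scale, then spin that many raw yields. A condensed, self-contained sketch of that shape (the statics stand in for YieldProcessorNormalization's members and are assumptions, not the commit's code):

#include <climits>
#include <cstddef>

static const unsigned int MaxYieldsPerNormalizedYield = 10; // assumed cap
static unsigned int s_yieldsPerNormalizedYield = 1;         // measured factor

inline void RawYield() { /* _mm_pause() on x86, __yield() on ARM64, ... */ }

void YieldNormalizedSketch(unsigned int count)
{
    if (count == 0)
        return; // the real helpers assert count != 0 instead

    // On platforms where size_t is no wider than unsigned int, clamp
    // against the cap (which the measured factor never exceeds) so the
    // multiply below cannot overflow.
    if (sizeof(size_t) <= sizeof(unsigned int))
    {
        const unsigned int MaxCount = UINT_MAX / MaxYieldsPerNormalizedYield;
        if (count > MaxCount)
            count = MaxCount;
    }

    size_t n = (size_t)count * s_yieldsPerNormalizedYield;
    do
    {
        RawYield();
    } while (--n != 0);
}

The back-off variant in the last hunk grows the yield count exponentially with the spin iteration, with the shift capped so the count never exceeds the measured optimum; roughly:

unsigned int BackoffYields(unsigned int spinIteration, unsigned int optimalMax)
{
    const unsigned int MaxShift = 3; // 1 << MaxShift stays within the optimum
    if (spinIteration <= MaxShift && (1u << spinIteration) < optimalMax)
        return 1u << spinIteration;  // exponential ramp-up
    return optimalMax;               // clamp at the measured optimum
}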
1 change: 1 addition & 0 deletions src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,6 +20,7 @@ enum CrstType
CrstRestrictedCallouts,
CrstGcStressControl,
CrstThreadStore,
CrstYieldProcessorNormalized,
CrstEventPipe,
CrstEventPipeConfig,
CrstGcEvent,
8 changes: 3 additions & 5 deletions src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -48,6 +48,9 @@ uint32_t WINAPI FinalizerStart(void* pContext)

g_pFinalizerThread = PTR_Thread(pThread);

// We have some time until the first finalization request - use the time to calibrate normalized waits.
EnsureYieldProcessorNormalizedInitialized();

// Wait for a finalization request.
uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
ASSERT(uResult == WAIT_OBJECT_0);
@@ -181,11 +184,6 @@ EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
{
FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
g_FinalizerDoneEvent.Set();

if (YieldProcessorNormalization::IsMeasurementScheduled())
{
YieldProcessorNormalization::PerformMeasurement();
}
}

//
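Two scheduling strategies meet in this file: the reverted code re-ran measurement opportunistically when finalization completed (IsMeasurementScheduled/PerformMeasurement), while the restored code calibrates once in the finalizer thread's idle window before its first wait. A small sketch of the restored flow, with hypothetical placeholders for the calibration and wait calls:

#include <chrono>
#include <thread>

// Hypothetical placeholders, not APIs from the commit.
static void CalibrateYields() { /* time a batch of yields, cache the result */ }
static bool WaitForFinalizationRequest()
{
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
    return true;
}

static void FinalizerThreadStart()
{
    // No work can arrive before the first request, so spend the idle
    // window on one-time calibration instead of stalling a later spin-wait.
    CalibrateYields();

    while (WaitForFinalizationRequest())
    {
        // ... drain and run finalizers ...
    }
}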
@@ -113,4 +113,3 @@ ThreadPoolWorkingThreadCount
ThreadRunning
WaitHandleWaitStart
WaitHandleWaitStop
YieldProcessorMeasurement
2 changes: 2 additions & 0 deletions src/coreclr/nativeaot/Runtime/startup.cpp
@@ -133,6 +133,8 @@ static bool InitDLL(HANDLE hPalInstance)
#endif
#endif // !USE_PORTABLE_HELPERS

InitializeYieldProcessorNormalizedCrst();

#ifdef STRESS_LOG
uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
20 changes: 0 additions & 20 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -56,26 +56,6 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue, int64_t iComparand)
return _InterlockedCompareExchange64(pDst, iValue, iComparand);
}

#ifdef HOST_X86
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
int64_t iOldValue;
do {
iOldValue = *pDst;
} while (PalInterlockedCompareExchange64(pDst,
iValue,
iOldValue) != iOldValue);
return iOldValue;
}
#else // HOST_X86
EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
#pragma intrinsic(_InterlockedExchange64)
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
return _InterlockedExchange64(pDst, iValue);
}
#endif // HOST_X86

#if defined(HOST_AMD64) || defined(HOST_ARM64)
EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
#pragma intrinsic(_InterlockedCompareExchange128)
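The deleted block is the classic emulation of a 64-bit atomic exchange on 32-bit x86, which has CMPXCHG8B but no 64-bit XCHG: loop on compare-exchange until the swap observes an unchanged value. The same idea expressed with standard atomics (a portable sketch, not the commit's code):

#include <atomic>
#include <cstdint>

int64_t Exchange64ViaCas(std::atomic<int64_t>& dst, int64_t value)
{
    int64_t old = dst.load(std::memory_order_relaxed);
    // compare_exchange_weak refreshes 'old' with the current value on
    // failure, so the loop retries until the swap happens atomically.
    while (!dst.compare_exchange_weak(old, value, std::memory_order_seq_cst,
                                      std::memory_order_relaxed))
    {
    }
    return old;
}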
102 changes: 100 additions & 2 deletions src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,6 +15,104 @@
#include "volatile.h"
#include "yieldprocessornormalized.h"

#include "../../utilcode/yieldprocessornormalized.cpp"
#define ULONGLONG int64_t

#include "../../vm/yieldprocessornormalizedshared.cpp"
static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
static CrstStatic s_initializeYieldProcessorNormalizedCrst;

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors; this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

void InitializeYieldProcessorNormalizedCrst()
{
WRAPPER_NO_CONTRACT;
s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
}

static void InitializeYieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int MeasureDurationMs = 10;
const int NsPerSecond = 1000 * 1000 * 1000;

ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();

if (ticksPerSecond < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_isYieldProcessorNormalizedInitialized = true;
return;
}

// Measure the nanosecond delay per yield
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
unsigned int yieldCount = 0;
ULONGLONG startTicks = PalQueryPerformanceCounter();
ULONGLONG elapsedTicks;
do
{
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
System_YieldProcessor();
}
yieldCount += 1000;

ULONGLONG nowTicks = PalQueryPerformanceCounter();
elapsedTicks = nowTicks - startTicks;
} while (elapsedTicks < measureDurationTicks);
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
if (nsPerYield < 1)
{
nsPerYield = 1;
}

// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
// value is naturally limited to MinNsPerNormalizedYield.
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
if (yieldsPerNormalizedYield < 1)
{
yieldsPerNormalizedYield = 1;
}
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);

// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
int optimalMaxNormalizedYieldsPerSpinIteration =
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
{
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;

GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
}

void EnsureYieldProcessorNormalizedInitialized()
{
WRAPPER_NO_CONTRACT;

if (!s_isYieldProcessorNormalizedInitialized)
{
InitializeYieldProcessorNormalized();
}
}
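The restored InitializeYieldProcessorNormalized reduces to three steps: time a large batch of raw yields against a high-resolution clock, convert to nanoseconds per yield, then derive both scale factors with round-to-nearest and a floor of one. A condensed sketch using std::chrono in place of the PAL counter; the 37.5 ns and 272 ns constants are assumptions in the spirit of MinNsPerNormalizedYield and NsPerOptimalMaxSpinIterationDuration, not necessarily the commit's values:

#include <immintrin.h> // _mm_pause (x86/x64 assumption)
#include <chrono>

static const double AssumedMinNsPerNormalizedYield = 37.5;
static const double AssumedNsPerOptimalMaxSpinIterationDuration = 272.0;

struct Calibration
{
    unsigned int yieldsPerNormalizedYield;
    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
};

static Calibration MeasureYieldCost()
{
    using Clock = std::chrono::steady_clock;

    // Yield in batches of 1000 so clock-read overhead is noise.
    const auto start = Clock::now();
    unsigned int yieldCount = 0;
    double elapsedNs;
    do
    {
        for (int i = 0; i < 1000; ++i)
            _mm_pause();
        yieldCount += 1000;
        elapsedNs = std::chrono::duration<double, std::nano>(Clock::now() - start).count();
    } while (elapsedNs < 10 * 1000 * 1000); // ~10 ms measurement window

    double nsPerYield = elapsedNs / yieldCount;
    if (nsPerYield < 1)
        nsPerYield = 1;

    // Raw yields per normalized yield, rounded to nearest, floored at 1.
    int perNormalized = (int)(AssumedMinNsPerNormalizedYield / nsPerYield + 0.5);
    if (perNormalized < 1)
        perNormalized = 1;

    // Normalized yields a late spin iteration should spend, same rounding.
    int perSpinIteration = (int)(AssumedNsPerOptimalMaxSpinIterationDuration /
                                 (perNormalized * nsPerYield) + 0.5);
    if (perSpinIteration < 1)
        perSpinIteration = 1;

    return { (unsigned int)perNormalized, (unsigned int)perSpinIteration };
}

EnsureYieldProcessorNormalizedInitialized itself is a double-checked lazy init: a lock-free fast path on a Volatile<bool>, a Crst serializing the slow path, and a re-check under the lock for threads that lose the race. The equivalent shape with standard types:

#include <atomic>
#include <mutex>

static std::atomic<bool> s_initialized{false};
static std::mutex s_initMutex;

static void InitializeOnce()
{
    std::lock_guard<std::mutex> lock(s_initMutex);
    if (s_initialized.load(std::memory_order_acquire))
        return; // another thread completed initialization first

    // ... expensive one-time measurement goes here ...

    s_initialized.store(true, std::memory_order_release);
}

static void EnsureInitialized()
{
    if (!s_initialized.load(std::memory_order_acquire))
        InitializeOnce();
}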
