[No review] Testing outerloop #105543

Closed
wants to merge 2 commits
6 changes: 6 additions & 0 deletions src/coreclr/gc/env/gcenv.os.h
@@ -6,6 +6,12 @@
#ifndef __GCENV_OS_H__
#define __GCENV_OS_H__

#ifdef HAS_SYSTEM_YIELDPROCESSOR
// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
#undef YieldProcessor
#define YieldProcessor System_YieldProcessor
#endif

#define NUMA_NODE_UNDEFINED UINT16_MAX

bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
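For context, a minimal sketch of the macro arrangement this hunk relies on (names are from this diff; the surrounding build setup is assumed): yieldprocessornormalized.h poisons YieldProcessor so stray call sites fail to compile, and GC code restores the system default through System_YieldProcessor.

// Sketch only: effect of the poisoning at a call site.
#define YieldProcessor Dont_Use_YieldProcessor  // set in yieldprocessornormalized.h

void SpinOnce()
{
    YieldProcessor();        // expands to Dont_Use_YieldProcessor() -> compile error
    System_YieldProcessor(); // OK: the explicit system-default pause
}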
39 changes: 21 additions & 18 deletions src/coreclr/inc/yieldprocessornormalized.h
@@ -3,11 +3,14 @@

#pragma once

#ifdef FEATURE_NATIVEAOT
FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
#else
// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
// the intention is to use the system-default implementation of YieldProcessor().
#define HAS_SYSTEM_YIELDPROCESSOR
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#ifdef YieldProcessor
#undef YieldProcessor
#endif
#define YieldProcessor Dont_Use_YieldProcessor

#define DISABLE_COPY(T) \
T(const T &) = delete; \
@@ -141,17 +144,17 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
{
_ASSERTE(count != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (count > MaxCount)
{
count = MaxCount;
}
}

size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
_ASSERTE(n != 0);
do
{
Expand Down Expand Up @@ -186,9 +189,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
{
_ASSERTE(preSkylakeCount != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -197,7 +200,7 @@
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
{
n = 1;
@@ -224,9 +227,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)

_ASSERTE(preSkylakeCount != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -235,8 +238,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
size_t n =
(size_t)preSkylakeCount *
SIZE_T n =
(SIZE_T)preSkylakeCount *
YieldProcessorNormalization::s_yieldsPerNormalizedYield /
PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
@@ -265,11 +268,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
unsigned int spinIteration)
{
// This shift value should be adjusted based on the asserted conditions below
const uint8_t MaxShift = 3;
static_assert(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
static_assert(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
const UINT8 MaxShift = 3;
static_assert_no_msg(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);

unsigned int n;
if (spinIteration <= MaxShift &&
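To show how the helpers above are meant to compose, here is a hedged usage sketch (the spin limit of 10 and the fallback are illustrative, not taken from this PR):

#include <atomic>

// Sketch: a spin-wait built on the normalized helpers from this header.
bool TryAcquireSpinThenGiveUp(std::atomic<bool>& taken)
{
    YieldProcessorNormalizationInfo normalizationInfo; // snapshots the measured scaling factors
    for (unsigned int spinIteration = 0; spinIteration < 10; ++spinIteration)
    {
        bool expected = false;
        if (taken.compare_exchange_weak(expected, true))
        {
            return true;
        }
        // Exponential backoff; each normalized yield costs roughly the same
        // wall-clock time regardless of the processor's pause latency.
        YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration);
    }
    return false; // caller should fall back to a real wait primitive
}

The PreSkylakeCountToSkylakeCountDivisor of 8 keeps old tuning valid: with the Skylake-class default of 1 yield per normalized yield, a preSkylakeCount of 100 becomes 100 * 1 / 8 = 12 pause instructions, while a measured pre-Skylake factor of ~8 gives 100 * 8 / 8 = 100, the count the caller was originally tuned with.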
@@ -276,7 +276,7 @@ internal static extern unsafe IntPtr RhpCallPropagateExceptionCallback(

// Indicate that the current round of finalizations is complete.
[DllImport(Redhawk.BaseName)]
internal static extern void RhpSignalFinalizationComplete(uint fCount, int observedFullGcCount);
internal static extern void RhpSignalFinalizationComplete(uint fCount);

[DllImport(Redhawk.BaseName)]
internal static extern ulong RhpGetTickCount64();
@@ -29,14 +29,11 @@ public static void ProcessFinalizers()
// otherwise memory is low and we should initiate a collection.
if (InternalCalls.RhpWaitForFinalizerRequest() != 0)
{
int observedFullGcCount = RuntimeImports.RhGetGcCollectionCount(RuntimeImports.RhGetMaxGcGeneration(), false);
uint finalizerCount = DrainQueue();

// Anyone waiting to drain the Q can now wake up. Note that there is a
// race in that another thread starting a drain, as we leave a drain, may
// consider itself satisfied by the drain that just completed.
// Thus we include the Full GC count that we have certainly observed.
InternalCalls.RhpSignalFinalizationComplete(finalizerCount, observedFullGcCount);
// Tell anybody that's interested that the finalization pass is complete (there is a race condition here
// where we might immediately signal a new request as complete, but this is acceptable).
InternalCalls.RhpSignalFinalizationComplete(finalizerCount);
}
else
{
1 change: 1 addition & 0 deletions src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,6 +20,7 @@ enum CrstType
CrstRestrictedCallouts,
CrstGcStressControl,
CrstThreadStore,
CrstYieldProcessorNormalized,
CrstEventPipe,
CrstEventPipeConfig,
CrstGcEvent,
45 changes: 10 additions & 35 deletions src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -48,6 +48,9 @@ uint32_t WINAPI FinalizerStart(void* pContext)

g_pFinalizerThread = PTR_Thread(pThread);

// We have some time until the first finalization request - use the time to calibrate normalized waits.
EnsureYieldProcessorNormalizedInitialized();

// Wait for a finalization request.
uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
ASSERT(uResult == WAIT_OBJECT_0);
@@ -94,22 +97,6 @@ EXTERN_C void QCALLTYPE RhInitializeFinalizerThread()
g_FinalizerEvent.Set();
}

static int32_t g_fullGcCountSeenByFinalization;

// Indicate that the current round of finalizations is complete.
EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount, int32_t observedFullGcCount)
{
FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());

g_fullGcCountSeenByFinalization = observedFullGcCount;
g_FinalizerDoneEvent.Set();

if (YieldProcessorNormalization::IsMeasurementScheduled())
{
YieldProcessorNormalization::PerformMeasurement();
}
}

EXTERN_C void QCALLTYPE RhWaitForPendingFinalizers(UInt32_BOOL allowReentrantWait)
{
// This must be called via p/invoke rather than RuntimeImport since it blocks and could starve the GC if
@@ -119,32 +106,13 @@ EXTERN_C void QCALLTYPE RhWaitForPendingFinalizers(UInt32_BOOL allowReentrantWait)
// Can't call this from the finalizer thread itself.
if (ThreadStore::GetCurrentThread() != g_pFinalizerThread)
{
// We may see a completion of finalization cycle that might not see objects that became
// F-reachable in recent GCs. In such case we want to wait for a completion of another cycle.
// However, since an object cannot be prevented from promoting, one can only rely on Full GCs
// to collect unreferenced objects deterministically. Thus we only care about Full GCs here.
int desiredFullGcCount =
GCHeapUtilities::GetGCHeap()->CollectionCount(GCHeapUtilities::GetGCHeap()->GetMaxGeneration());

tryAgain:
// Clear any current indication that a finalization pass is finished and wake the finalizer thread up
// (if there's no work to do it'll set the done event immediately).
g_FinalizerDoneEvent.Reset();
g_FinalizerEvent.Set();

// Wait for the finalizer thread to get back to us.
g_FinalizerDoneEvent.Wait(INFINITE, false, allowReentrantWait);

// we use unsigned math here as the collection counts, which are size_t internally,
// can in theory overflow an int and wrap around.
// unsigned math would have more defined/portable behavior in such case
if ((int)((unsigned int)desiredFullGcCount - (unsigned int)g_fullGcCountSeenByFinalization) > 0)
{
// There were some Full GCs happening before we started waiting and possibly not seen by the
// last finalization cycle. This is rare, but we need to be sure we have seen those,
// so we try one more time.
goto tryAgain;
}
}
}

@@ -211,6 +179,13 @@ EXTERN_C UInt32_BOOL QCALLTYPE RhpWaitForFinalizerRequest()
} while (true);
}

// Indicate that the current round of finalizations is complete.
EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
{
FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
g_FinalizerDoneEvent.Set();
}
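
The comments above describe a two-event handshake with a deliberately tolerated race. A self-contained sketch of that protocol using standard primitives (this models the pattern, not the runtime's actual event types):

#include <condition_variable>
#include <cstdint>
#include <mutex>

std::mutex s_lock;
std::condition_variable s_condition;
bool s_workRequested = false;      // plays the role of g_FinalizerEvent
uint64_t s_passesCompleted = 0;    // incremented where g_FinalizerDoneEvent is set

void WaitForPendingFinalizers()
{
    std::unique_lock<std::mutex> lock(s_lock);
    uint64_t seen = s_passesCompleted;
    s_workRequested = true;
    s_condition.notify_all();
    // Wake when any pass completes after our snapshot; a pass racing ahead of a
    // brand-new request may satisfy us early, the race the comment accepts.
    s_condition.wait(lock, [&] { return s_passesCompleted != seen; });
}

void SignalFinalizationComplete()  // finalizer thread, after draining the queue
{
    std::lock_guard<std::mutex> lock(s_lock);
    s_workRequested = false;
    ++s_passesCompleted;
    s_condition.notify_all();
}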

//
// The following helpers are special in that they interact with internal GC state or directly manipulate
// managed references so they're called with a special co-operative p/invoke.
@@ -113,4 +113,3 @@ ThreadPoolWorkingThreadCount
ThreadRunning
WaitHandleWaitStart
WaitHandleWaitStop
YieldProcessorMeasurement
2 changes: 2 additions & 0 deletions src/coreclr/nativeaot/Runtime/startup.cpp
@@ -133,6 +133,8 @@ static bool InitDLL(HANDLE hPalInstance)
#endif
#endif // !USE_PORTABLE_HELPERS

InitializeYieldProcessorNormalizedCrst();

#ifdef STRESS_LOG
uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
20 changes: 0 additions & 20 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -56,26 +56,6 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pDst,
return _InterlockedCompareExchange64(pDst, iValue, iComparand);
}

#ifdef HOST_X86
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
int64_t iOldValue;
do {
iOldValue = *pDst;
} while (PalInterlockedCompareExchange64(pDst,
iValue,
iOldValue) != iOldValue);
return iOldValue;
}
#else // HOST_X86
EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
#pragma intrinsic(_InterlockedExchange64)
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
return _InterlockedExchange64(pDst, iValue);
}
#endif // HOST_X86

#if defined(HOST_AMD64) || defined(HOST_ARM64)
EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
#pragma intrinsic(_InterlockedCompareExchange128)
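The deleted HOST_X86 branch emulated a 64-bit atomic exchange with a compare-and-swap loop. The same pattern in portable C++, as a sketch rather than the runtime's code:

#include <atomic>
#include <cstdint>

// Retry compare_exchange until no other writer intervenes; returns the prior
// value, exactly what the removed PalInterlockedExchange64 fallback computed.
int64_t ExchangeViaCompareExchange(std::atomic<int64_t>& dst, int64_t value)
{
    int64_t old = dst.load(std::memory_order_relaxed);
    // On failure, compare_exchange_weak reloads 'old' with the current value,
    // so the loop makes progress once contention subsides.
    while (!dst.compare_exchange_weak(old, value))
    {
    }
    return old;
}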
102 changes: 100 additions & 2 deletions src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,6 +15,104 @@
#include "volatile.h"
#include "yieldprocessornormalized.h"

#include "../../utilcode/yieldprocessornormalized.cpp"
#define ULONGLONG int64_t

#include "../../vm/yieldprocessornormalizedshared.cpp"
static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
static CrstStatic s_initializeYieldProcessorNormalizedCrst;

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

void InitializeYieldProcessorNormalizedCrst()
{
WRAPPER_NO_CONTRACT;
s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
}

static void InitializeYieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int MeasureDurationMs = 10;
const int NsPerSecond = 1000 * 1000 * 1000;

ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();

if (ticksPerSecond < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_isYieldProcessorNormalizedInitialized = true;
return;
}

// Measure the nanosecond delay per yield
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
unsigned int yieldCount = 0;
ULONGLONG startTicks = PalQueryPerformanceCounter();
ULONGLONG elapsedTicks;
do
{
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
System_YieldProcessor();
}
yieldCount += 1000;

ULONGLONG nowTicks = PalQueryPerformanceCounter();
elapsedTicks = nowTicks - startTicks;
} while (elapsedTicks < measureDurationTicks);
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
if (nsPerYield < 1)
{
nsPerYield = 1;
}

// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
// value is naturally limited to MinNsPerNormalizedYield.
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
if (yieldsPerNormalizedYield < 1)
{
yieldsPerNormalizedYield = 1;
}
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);

// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
int optimalMaxNormalizedYieldsPerSpinIteration =
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
{
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;

GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
}

void EnsureYieldProcessorNormalizedInitialized()
{
WRAPPER_NO_CONTRACT;

if (!s_isYieldProcessorNormalizedInitialized)
{
InitializeYieldProcessorNormalized();
}
}
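
To make the calibration arithmetic concrete, a worked example (the two constants below are defined in yieldprocessornormalized.h; the values 37 ns and 272 ns used here are assumptions for illustration):

const double MinNsPerNormalizedYield = 37.0;               // assumed for illustration
const double NsPerOptimalMaxSpinIterationDuration = 272.0; // assumed for illustration

double nsPerYield = 4.6; // hypothetical pre-Skylake measurement
int yieldsPerNormalizedYield =
    (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);     // == 8, the "~8 for pre-Skylake" above
int optimalMaxNormalizedYieldsPerSpinIteration =
    (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5); // == 7

With a Skylake-class measurement of nsPerYield ≈ 37, the same math gives 1 and 7, matching the defaults at the top of this file.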