[No review] Testing outerloop #105543

Closed
wants to merge 2 commits
6 changes: 6 additions & 0 deletions src/coreclr/gc/env/gcenv.os.h
@@ -6,6 +6,12 @@
#ifndef __GCENV_OS_H__
#define __GCENV_OS_H__

#ifdef HAS_SYSTEM_YIELDPROCESSOR
// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
#undef YieldProcessor
#define YieldProcessor System_YieldProcessor
#endif

#define NUMA_NODE_UNDEFINED UINT16_MAX

bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
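For context, a minimal sketch of the macro arrangement this hunk relies on (names are from this diff; the surrounding build setup is assumed): yieldprocessornormalized.h poisons YieldProcessor so stray call sites fail to compile, and GC code restores the system default through System_YieldProcessor.

// Sketch only: effect of the poisoning at a call site.
#define YieldProcessor Dont_Use_YieldProcessor  // set in yieldprocessornormalized.h

void SpinOnce()
{
    YieldProcessor();        // expands to Dont_Use_YieldProcessor() -> compile error
    System_YieldProcessor(); // OK: the explicit system-default pause
}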
39 changes: 21 additions & 18 deletions src/coreclr/inc/yieldprocessornormalized.h
@@ -3,11 +3,14 @@

#pragma once

#ifdef FEATURE_NATIVEAOT
FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
#else
// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
// the intention is to use the system-default implementation of YieldProcessor().
#define HAS_SYSTEM_YIELDPROCESSOR
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#ifdef YieldProcessor
#undef YieldProcessor
#endif
#define YieldProcessor Dont_Use_YieldProcessor

#define DISABLE_COPY(T) \
T(const T &) = delete; \
@@ -141,17 +144,17 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
{
_ASSERTE(count != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (count > MaxCount)
{
count = MaxCount;
}
}

size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
_ASSERTE(n != 0);
do
{
Expand Down Expand Up @@ -186,9 +189,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
{
_ASSERTE(preSkylakeCount != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -197,7 +200,7 @@
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
{
n = 1;
@@ -224,9 +227,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)

_ASSERTE(preSkylakeCount != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -235,8 +238,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
size_t n =
(size_t)preSkylakeCount *
SIZE_T n =
(SIZE_T)preSkylakeCount *
YieldProcessorNormalization::s_yieldsPerNormalizedYield /
PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
@@ -265,11 +268,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
unsigned int spinIteration)
{
// This shift value should be adjusted based on the asserted conditions below
const uint8_t MaxShift = 3;
static_assert(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
static_assert(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
const UINT8 MaxShift = 3;
static_assert_no_msg(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);

unsigned int n;
if (spinIteration <= MaxShift &&
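To show how the helpers above are meant to compose, here is a hedged usage sketch (the spin limit of 10 and the fallback are illustrative, not taken from this PR):

#include <atomic>

// Sketch: a spin-wait built on the normalized helpers from this header.
bool TryAcquireSpinThenGiveUp(std::atomic<bool>& taken)
{
    YieldProcessorNormalizationInfo normalizationInfo; // snapshots the measured scaling factors
    for (unsigned int spinIteration = 0; spinIteration < 10; ++spinIteration)
    {
        bool expected = false;
        if (taken.compare_exchange_weak(expected, true))
        {
            return true;
        }
        // Exponential backoff; each normalized yield costs roughly the same
        // wall-clock time regardless of the processor's pause latency.
        YieldProcessorWithBackOffNormalized(normalizationInfo, spinIteration);
    }
    return false; // caller should fall back to a real wait primitive
}

The PreSkylakeCountToSkylakeCountDivisor of 8 keeps old tuning valid: with the Skylake-class default of 1 yield per normalized yield, a preSkylakeCount of 100 becomes 100 * 1 / 8 = 12 pause instructions, while a measured pre-Skylake factor of ~8 gives 100 * 8 / 8 = 100, the count the caller was originally tuned with.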
@@ -276,7 +276,7 @@ internal static extern unsafe IntPtr RhpCallPropagateExceptionCallback(

// Indicate that the current round of finalizations is complete.
[DllImport(Redhawk.BaseName)]
internal static extern void RhpSignalFinalizationComplete(uint fCount, int observedFullGcCount);
internal static extern void RhpSignalFinalizationComplete(uint fCount);

[DllImport(Redhawk.BaseName)]
internal static extern ulong RhpGetTickCount64();
@@ -29,14 +29,11 @@ public static void ProcessFinalizers()
// otherwise memory is low and we should initiate a collection.
if (InternalCalls.RhpWaitForFinalizerRequest() != 0)
{
int observedFullGcCount = RuntimeImports.RhGetGcCollectionCount(RuntimeImports.RhGetMaxGcGeneration(), false);
uint finalizerCount = DrainQueue();

// Anyone waiting to drain the Q can now wake up. Note that there is a
// race in that another thread starting a drain, as we leave a drain, may
// consider itself satisfied by the drain that just completed.
// Thus we include the Full GC count that we have certainly observed.
InternalCalls.RhpSignalFinalizationComplete(finalizerCount, observedFullGcCount);
// Tell anybody that's interested that the finalization pass is complete (there is a race condition here
// where we might immediately signal a new request as complete, but this is acceptable).
InternalCalls.RhpSignalFinalizationComplete(finalizerCount);
}
else
{
1 change: 1 addition & 0 deletions src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,6 +20,7 @@ enum CrstType
CrstRestrictedCallouts,
CrstGcStressControl,
CrstThreadStore,
CrstYieldProcessorNormalized,
CrstEventPipe,
CrstEventPipeConfig,
CrstGcEvent,
45 changes: 10 additions & 35 deletions src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -48,6 +48,9 @@ uint32_t WINAPI FinalizerStart(void* pContext)

g_pFinalizerThread = PTR_Thread(pThread);

// We have some time until the first finalization request - use the time to calibrate normalized waits.
EnsureYieldProcessorNormalizedInitialized();

// Wait for a finalization request.
uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
ASSERT(uResult == WAIT_OBJECT_0);
@@ -94,22 +97,6 @@ EXTERN_C void QCALLTYPE RhInitializeFinalizerThread()
g_FinalizerEvent.Set();
}

static int32_t g_fullGcCountSeenByFinalization;

// Indicate that the current round of finalizations is complete.
EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount, int32_t observedFullGcCount)
{
FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());

g_fullGcCountSeenByFinalization = observedFullGcCount;
g_FinalizerDoneEvent.Set();

if (YieldProcessorNormalization::IsMeasurementScheduled())
{
YieldProcessorNormalization::PerformMeasurement();
}
}

EXTERN_C void QCALLTYPE RhWaitForPendingFinalizers(UInt32_BOOL allowReentrantWait)
{
// This must be called via p/invoke rather than RuntimeImport since it blocks and could starve the GC if
@@ -119,32 +106,13 @@ EXTERN_C void QCALLTYPE RhWaitForPendingFinalizers(UInt32_BOOL allowReentrantWait)
// Can't call this from the finalizer thread itself.
if (ThreadStore::GetCurrentThread() != g_pFinalizerThread)
{
// We may see a completion of finalization cycle that might not see objects that became
// F-reachable in recent GCs. In such case we want to wait for a completion of another cycle.
// However, since an object cannot be prevented from promoting, one can only rely on Full GCs
// to collect unreferenced objects deterministically. Thus we only care about Full GCs here.
int desiredFullGcCount =
GCHeapUtilities::GetGCHeap()->CollectionCount(GCHeapUtilities::GetGCHeap()->GetMaxGeneration());

tryAgain:
// Clear any current indication that a finalization pass is finished and wake the finalizer thread up
// (if there's no work to do it'll set the done event immediately).
g_FinalizerDoneEvent.Reset();
g_FinalizerEvent.Set();

// Wait for the finalizer thread to get back to us.
g_FinalizerDoneEvent.Wait(INFINITE, false, allowReentrantWait);

// we use unsigned math here as the collection counts, which are size_t internally,
// can in theory overflow an int and wrap around.
// unsigned math would have more defined/portable behavior in such case
if ((int)((unsigned int)desiredFullGcCount - (unsigned int)g_fullGcCountSeenByFinalization) > 0)
{
// There were some Full GCs happening before we started waiting and possibly not seen by the
// last finalization cycle. This is rare, but we need to be sure we have seen those,
// so we try one more time.
goto tryAgain;
}
}
}

@@ -211,6 +179,13 @@ EXTERN_C UInt32_BOOL QCALLTYPE RhpWaitForFinalizerRequest()
} while (true);
}

// Indicate that the current round of finalizations is complete.
EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
{
FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
g_FinalizerDoneEvent.Set();
}
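
The comments above describe a two-event handshake with a deliberately tolerated race. A self-contained sketch of that protocol using standard primitives (this models the pattern, not the runtime's actual event types):

#include <condition_variable>
#include <cstdint>
#include <mutex>

std::mutex s_lock;
std::condition_variable s_condition;
bool s_workRequested = false;      // plays the role of g_FinalizerEvent
uint64_t s_passesCompleted = 0;    // incremented where g_FinalizerDoneEvent is set

void WaitForPendingFinalizers()
{
    std::unique_lock<std::mutex> lock(s_lock);
    uint64_t seen = s_passesCompleted;
    s_workRequested = true;
    s_condition.notify_all();
    // Wake when any pass completes after our snapshot; a pass racing ahead of a
    // brand-new request may satisfy us early, the race the comment accepts.
    s_condition.wait(lock, [&] { return s_passesCompleted != seen; });
}

void SignalFinalizationComplete()  // finalizer thread, after draining the queue
{
    std::lock_guard<std::mutex> lock(s_lock);
    s_workRequested = false;
    ++s_passesCompleted;
    s_condition.notify_all();
}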

//
// The following helpers are special in that they interact with internal GC state or directly manipulate
// managed references so they're called with a special co-operative p/invoke.
@@ -113,4 +113,3 @@ ThreadPoolWorkingThreadCount
ThreadRunning
WaitHandleWaitStart
WaitHandleWaitStop
YieldProcessorMeasurement
2 changes: 2 additions & 0 deletions src/coreclr/nativeaot/Runtime/startup.cpp
@@ -133,6 +133,8 @@ static bool InitDLL(HANDLE hPalInstance)
#endif
#endif // !USE_PORTABLE_HELPERS

InitializeYieldProcessorNormalizedCrst();

#ifdef STRESS_LOG
uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
20 changes: 0 additions & 20 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -56,26 +56,6 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pDst,
return _InterlockedCompareExchange64(pDst, iValue, iComparand);
}

#ifdef HOST_X86
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
int64_t iOldValue;
do {
iOldValue = *pDst;
} while (PalInterlockedCompareExchange64(pDst,
iValue,
iOldValue) != iOldValue);
return iOldValue;
}
#else // HOST_X86
EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
#pragma intrinsic(_InterlockedExchange64)
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
return _InterlockedExchange64(pDst, iValue);
}
#endif // HOST_X86

#if defined(HOST_AMD64) || defined(HOST_ARM64)
EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
#pragma intrinsic(_InterlockedCompareExchange128)
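The deleted HOST_X86 branch emulated a 64-bit atomic exchange with a compare-and-swap loop. The same pattern in portable C++, as a sketch rather than the runtime's code:

#include <atomic>
#include <cstdint>

// Retry compare_exchange until no other writer intervenes; returns the prior
// value, exactly what the removed PalInterlockedExchange64 fallback computed.
int64_t ExchangeViaCompareExchange(std::atomic<int64_t>& dst, int64_t value)
{
    int64_t old = dst.load(std::memory_order_relaxed);
    // On failure, compare_exchange_weak reloads 'old' with the current value,
    // so the loop makes progress once contention subsides.
    while (!dst.compare_exchange_weak(old, value))
    {
    }
    return old;
}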
102 changes: 100 additions & 2 deletions src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,6 +15,104 @@
#include "volatile.h"
#include "yieldprocessornormalized.h"

#include "../../utilcode/yieldprocessornormalized.cpp"
#define ULONGLONG int64_t

#include "../../vm/yieldprocessornormalizedshared.cpp"
static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
static CrstStatic s_initializeYieldProcessorNormalizedCrst;

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

void InitializeYieldProcessorNormalizedCrst()
{
WRAPPER_NO_CONTRACT;
s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
}

static void InitializeYieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int MeasureDurationMs = 10;
const int NsPerSecond = 1000 * 1000 * 1000;

ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();

if (ticksPerSecond < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_isYieldProcessorNormalizedInitialized = true;
return;
}

// Measure the nanosecond delay per yield
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
unsigned int yieldCount = 0;
ULONGLONG startTicks = PalQueryPerformanceCounter();
ULONGLONG elapsedTicks;
do
{
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
System_YieldProcessor();
}
yieldCount += 1000;

ULONGLONG nowTicks = PalQueryPerformanceCounter();
elapsedTicks = nowTicks - startTicks;
} while (elapsedTicks < measureDurationTicks);
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
if (nsPerYield < 1)
{
nsPerYield = 1;
}

// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
// value is naturally limited to MinNsPerNormalizedYield.
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
if (yieldsPerNormalizedYield < 1)
{
yieldsPerNormalizedYield = 1;
}
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);

// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
int optimalMaxNormalizedYieldsPerSpinIteration =
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
{
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;

GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
}

void EnsureYieldProcessorNormalizedInitialized()
{
WRAPPER_NO_CONTRACT;

if (!s_isYieldProcessorNormalizedInitialized)
{
InitializeYieldProcessorNormalized();
}
}
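
To make the calibration arithmetic concrete, a worked example (the two constants below are defined in yieldprocessornormalized.h; the values 37 ns and 272 ns used here are assumptions for illustration):

const double MinNsPerNormalizedYield = 37.0;               // assumed for illustration
const double NsPerOptimalMaxSpinIterationDuration = 272.0; // assumed for illustration

double nsPerYield = 4.6; // hypothetical pre-Skylake measurement
int yieldsPerNormalizedYield =
    (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);     // == 8, the "~8 for pre-Skylake" above
int optimalMaxNormalizedYieldsPerSpinIteration =
    (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5); // == 7

With a Skylake-class measurement of nsPerYield ≈ 37, the same math gives 1 and 7, matching the defaults at the top of this file.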