Revert "Port yield normalization from CoreCLR to Native AOT (#103675)"
This reverts commit d35f302.
MichalStrehovsky committed Jul 26, 2024
1 parent f6b4960 commit 45de37b
Showing 12 changed files with 653 additions and 390 deletions.
6 changes: 6 additions & 0 deletions src/coreclr/gc/env/gcenv.os.h
@@ -6,6 +6,12 @@
#ifndef __GCENV_OS_H__
#define __GCENV_OS_H__

#ifdef HAS_SYSTEM_YIELDPROCESSOR
// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
#undef YieldProcessor
#define YieldProcessor System_YieldProcessor
#endif

#define NUMA_NODE_UNDEFINED UINT16_MAX

bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
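The hunk above is one half of a macro handshake: yieldprocessornormalized.h poisons the raw YieldProcessor name to steer callers toward the normalized wrappers, and GC headers that legitimately need the raw instruction map the name back to System_YieldProcessor. A minimal sketch of the pattern, assuming an x86/x64 target where the system yield is _mm_pause (the stand-in macro and layout are illustrative, not the commit's exact code):

#include <immintrin.h> // _mm_pause (x86/x64 assumption)

// Stand-in for the platform-provided macro.
#define YieldProcessor() _mm_pause()

// Capture a usable alias to the system implementation first...
inline void System_YieldProcessor() { YieldProcessor(); }
#define HAS_SYSTEM_YIELDPROCESSOR

// ...then poison the original name so stray uses fail to compile,
// nudging new code toward the normalized wrappers.
#undef YieldProcessor
#define YieldProcessor Dont_Use_YieldProcessor

// A component allowed to issue raw yields (the GC here) restores it:
#ifdef HAS_SYSTEM_YIELDPROCESSOR
#undef YieldProcessor
#define YieldProcessor System_YieldProcessor
#endif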
39 changes: 21 additions & 18 deletions src/coreclr/inc/yieldprocessornormalized.h
@@ -3,11 +3,14 @@

#pragma once

#ifdef FEATURE_NATIVEAOT
FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
#else
// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
// the intention is to use the system-default implementation of YieldProcessor().
#define HAS_SYSTEM_YIELDPROCESSOR
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#ifdef YieldProcessor
#undef YieldProcessor
#endif
#define YieldProcessor Dont_Use_YieldProcessor

#define DISABLE_COPY(T) \
T(const T &) = delete; \
@@ -141,17 +144,17 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
{
_ASSERTE(count != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (count > MaxCount)
{
count = MaxCount;
}
}

size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
_ASSERTE(n != 0);
do
{
@@ -186,9 +189,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
{
_ASSERTE(preSkylakeCount != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -197,7 +200,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
{
n = 1;
@@ -224,9 +227,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)

_ASSERTE(preSkylakeCount != 0);

if (sizeof(size_t) <= sizeof(unsigned int))
if (sizeof(SIZE_T) <= sizeof(unsigned int))
{
// On platforms with a small size_t, prevent overflow on the multiply below
// On platforms with a small SIZE_T, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -235,8 +238,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
size_t n =
(size_t)preSkylakeCount *
SIZE_T n =
(SIZE_T)preSkylakeCount *
YieldProcessorNormalization::s_yieldsPerNormalizedYield /
PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
@@ -265,11 +268,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
unsigned int spinIteration)
{
// This shift value should be adjusted based on the asserted conditions below
const uint8_t MaxShift = 3;
static_assert(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
static_assert(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
const UINT8 MaxShift = 3;
static_assert_no_msg(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);

unsigned int n;
if (spinIteration <= MaxShift &&
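The normalized helpers in this header share one shape: clamp the requested count so the multiply by the measured scale factor cannot overflow on 32-bit platforms, scale, then spin that many raw yields. A condensed, self-contained sketch of that shape (the statics stand in for YieldProcessorNormalization's members and are assumptions, not the commit's code):

#include <climits>
#include <cstddef>

static const unsigned int MaxYieldsPerNormalizedYield = 10; // assumed cap
static unsigned int s_yieldsPerNormalizedYield = 1;         // measured factor

inline void RawYield() { /* _mm_pause() on x86, __yield() on ARM64, ... */ }

void YieldNormalizedSketch(unsigned int count)
{
    if (count == 0)
        return; // the real helpers assert count != 0 instead

    // On platforms where size_t is no wider than unsigned int, clamp
    // against the cap (which the measured factor never exceeds) so the
    // multiply below cannot overflow.
    if (sizeof(size_t) <= sizeof(unsigned int))
    {
        const unsigned int MaxCount = UINT_MAX / MaxYieldsPerNormalizedYield;
        if (count > MaxCount)
            count = MaxCount;
    }

    size_t n = (size_t)count * s_yieldsPerNormalizedYield;
    do
    {
        RawYield();
    } while (--n != 0);
}

The back-off variant in the last hunk grows the yield count exponentially with the spin iteration, with the shift capped so the count never exceeds the measured optimum; roughly:

unsigned int BackoffYields(unsigned int spinIteration, unsigned int optimalMax)
{
    const unsigned int MaxShift = 3; // 1 << MaxShift stays within the optimum
    if (spinIteration <= MaxShift && (1u << spinIteration) < optimalMax)
        return 1u << spinIteration;  // exponential ramp-up
    return optimalMax;               // clamp at the measured optimum
}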
1 change: 1 addition & 0 deletions src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,6 +20,7 @@ enum CrstType
CrstRestrictedCallouts,
CrstGcStressControl,
CrstThreadStore,
CrstYieldProcessorNormalized,
CrstEventPipe,
CrstEventPipeConfig,
CrstGcEvent,
8 changes: 3 additions & 5 deletions src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -48,6 +48,9 @@ uint32_t WINAPI FinalizerStart(void* pContext)

g_pFinalizerThread = PTR_Thread(pThread);

// We have some time until the first finalization request - use the time to calibrate normalized waits.
EnsureYieldProcessorNormalizedInitialized();

// Wait for a finalization request.
uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
ASSERT(uResult == WAIT_OBJECT_0);
@@ -181,11 +184,6 @@ EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
{
FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
g_FinalizerDoneEvent.Set();

if (YieldProcessorNormalization::IsMeasurementScheduled())
{
YieldProcessorNormalization::PerformMeasurement();
}
}

//
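Two scheduling strategies meet in this file: the reverted code re-ran measurement opportunistically when finalization completed (IsMeasurementScheduled/PerformMeasurement), while the restored code calibrates once in the finalizer thread's idle window before its first wait. A small sketch of the restored flow, with hypothetical placeholders for the calibration and wait calls:

#include <chrono>
#include <thread>

// Hypothetical placeholders, not APIs from the commit.
static void CalibrateYields() { /* time a batch of yields, cache the result */ }
static bool WaitForFinalizationRequest()
{
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
    return true;
}

static void FinalizerThreadStart()
{
    // No work can arrive before the first request, so spend the idle
    // window on one-time calibration instead of stalling a later spin-wait.
    CalibrateYields();

    while (WaitForFinalizationRequest())
    {
        // ... drain and run finalizers ...
    }
}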
@@ -113,4 +113,3 @@ ThreadPoolWorkingThreadCount
ThreadRunning
WaitHandleWaitStart
WaitHandleWaitStop
YieldProcessorMeasurement
2 changes: 2 additions & 0 deletions src/coreclr/nativeaot/Runtime/startup.cpp
@@ -133,6 +133,8 @@ static bool InitDLL(HANDLE hPalInstance)
#endif
#endif // !USE_PORTABLE_HELPERS

InitializeYieldProcessorNormalizedCrst();

#ifdef STRESS_LOG
uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
20 changes: 0 additions & 20 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -56,26 +56,6 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue, int64_t iComparand)
return _InterlockedCompareExchange64(pDst, iValue, iComparand);
}

#ifdef HOST_X86
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
int64_t iOldValue;
do {
iOldValue = *pDst;
} while (PalInterlockedCompareExchange64(pDst,
iValue,
iOldValue) != iOldValue);
return iOldValue;
}
#else // HOST_X86
EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
#pragma intrinsic(_InterlockedExchange64)
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
return _InterlockedExchange64(pDst, iValue);
}
#endif // HOST_X86

#if defined(HOST_AMD64) || defined(HOST_ARM64)
EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
#pragma intrinsic(_InterlockedCompareExchange128)
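The deleted block is the classic emulation of a 64-bit atomic exchange on 32-bit x86, which has CMPXCHG8B but no 64-bit XCHG: loop on compare-exchange until the swap observes an unchanged value. The same idea expressed with standard atomics (a portable sketch, not the commit's code):

#include <atomic>
#include <cstdint>

int64_t Exchange64ViaCas(std::atomic<int64_t>& dst, int64_t value)
{
    int64_t old = dst.load(std::memory_order_relaxed);
    // compare_exchange_weak refreshes 'old' with the current value on
    // failure, so the loop retries until the swap happens atomically.
    while (!dst.compare_exchange_weak(old, value, std::memory_order_seq_cst,
                                      std::memory_order_relaxed))
    {
    }
    return old;
}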
102 changes: 100 additions & 2 deletions src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,6 +15,104 @@
#include "volatile.h"
#include "yieldprocessornormalized.h"

#include "../../utilcode/yieldprocessornormalized.cpp"
#define ULONGLONG int64_t

#include "../../vm/yieldprocessornormalizedshared.cpp"
static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
static CrstStatic s_initializeYieldProcessorNormalizedCrst;

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors; this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

void InitializeYieldProcessorNormalizedCrst()
{
WRAPPER_NO_CONTRACT;
s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
}

static void InitializeYieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int MeasureDurationMs = 10;
const int NsPerSecond = 1000 * 1000 * 1000;

ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();

if (ticksPerSecond < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_isYieldProcessorNormalizedInitialized = true;
return;
}

// Measure the nanosecond delay per yield
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
unsigned int yieldCount = 0;
ULONGLONG startTicks = PalQueryPerformanceCounter();
ULONGLONG elapsedTicks;
do
{
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
System_YieldProcessor();
}
yieldCount += 1000;

ULONGLONG nowTicks = PalQueryPerformanceCounter();
elapsedTicks = nowTicks - startTicks;
} while (elapsedTicks < measureDurationTicks);
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
if (nsPerYield < 1)
{
nsPerYield = 1;
}

// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
// value is naturally limited to MinNsPerNormalizedYield.
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
if (yieldsPerNormalizedYield < 1)
{
yieldsPerNormalizedYield = 1;
}
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);

// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
int optimalMaxNormalizedYieldsPerSpinIteration =
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
{
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;

GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
}

void EnsureYieldProcessorNormalizedInitialized()
{
WRAPPER_NO_CONTRACT;

if (!s_isYieldProcessorNormalizedInitialized)
{
InitializeYieldProcessorNormalized();
}
}
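The restored InitializeYieldProcessorNormalized reduces to three steps: time a large batch of raw yields against a high-resolution clock, convert to nanoseconds per yield, then derive both scale factors with round-to-nearest and a floor of one. A condensed sketch using std::chrono in place of the PAL counter; the 37.5 ns and 272 ns constants are assumptions in the spirit of MinNsPerNormalizedYield and NsPerOptimalMaxSpinIterationDuration, not necessarily the commit's values:

#include <immintrin.h> // _mm_pause (x86/x64 assumption)
#include <chrono>

static const double AssumedMinNsPerNormalizedYield = 37.5;
static const double AssumedNsPerOptimalMaxSpinIterationDuration = 272.0;

struct Calibration
{
    unsigned int yieldsPerNormalizedYield;
    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
};

static Calibration MeasureYieldCost()
{
    using Clock = std::chrono::steady_clock;

    // Yield in batches of 1000 so clock-read overhead is noise.
    const auto start = Clock::now();
    unsigned int yieldCount = 0;
    double elapsedNs;
    do
    {
        for (int i = 0; i < 1000; ++i)
            _mm_pause();
        yieldCount += 1000;
        elapsedNs = std::chrono::duration<double, std::nano>(Clock::now() - start).count();
    } while (elapsedNs < 10 * 1000 * 1000); // ~10 ms measurement window

    double nsPerYield = elapsedNs / yieldCount;
    if (nsPerYield < 1)
        nsPerYield = 1;

    // Raw yields per normalized yield, rounded to nearest, floored at 1.
    int perNormalized = (int)(AssumedMinNsPerNormalizedYield / nsPerYield + 0.5);
    if (perNormalized < 1)
        perNormalized = 1;

    // Normalized yields a late spin iteration should spend, same rounding.
    int perSpinIteration = (int)(AssumedNsPerOptimalMaxSpinIterationDuration /
                                 (perNormalized * nsPerYield) + 0.5);
    if (perSpinIteration < 1)
        perSpinIteration = 1;

    return { (unsigned int)perNormalized, (unsigned int)perSpinIteration };
}

EnsureYieldProcessorNormalizedInitialized itself is a double-checked lazy init: a lock-free fast path on a Volatile<bool>, a Crst serializing the slow path, and a re-check under the lock for threads that lose the race. The equivalent shape with standard types:

#include <atomic>
#include <mutex>

static std::atomic<bool> s_initialized{false};
static std::mutex s_initMutex;

static void InitializeOnce()
{
    std::lock_guard<std::mutex> lock(s_initMutex);
    if (s_initialized.load(std::memory_order_acquire))
        return; // another thread completed initialization first

    // ... expensive one-time measurement goes here ...

    s_initialized.store(true, std::memory_order_release);
}

static void EnsureInitialized()
{
    if (!s_initialized.load(std::memory_order_acquire))
        InitializeOnce();
}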
