Initialize values for yield normalization during startup #99991

Closed
2 changes: 2 additions & 0 deletions src/coreclr/inc/yieldprocessornormalized.h
@@ -43,6 +43,8 @@ class YieldProcessorNormalization
         return s_isMeasurementScheduled;
     }
 
+    static void NotifyGC();
+
     static void PerformMeasurement();
 
 private:
7 changes: 7 additions & 0 deletions src/coreclr/vm/ceemain.cpp
@@ -163,6 +163,7 @@
 #include "jithost.h"
 #include "pgo.h"
 #include "pendingload.h"
+#include "yieldprocessornormalized.h"
 
 #ifndef TARGET_UNIX
 #include "dwreport.h"
@@ -779,6 +780,10 @@ void EEStartupHelper()
     StubPrecode::StaticInitialize();
     FixupPrecode::StaticInitialize();
 
+    // Perform some measurements before the garbage collector is initialized
+    // so that GC initialization and the first few GCs can use the normalized yield
+    YieldProcessorNormalization::PerformMeasurement();
+
     InitializeGarbageCollector();
 
     if (!GCHandleUtilities::GetGCHandleManager()->Initialize())
@@ -878,6 +883,8 @@ void EEStartupHelper()
         LogErrorToHost("GC heap initialization failed with error 0x%08X", hr);
     }
 
+    YieldProcessorNormalization::NotifyGC();
+
     IfFailGo(hr);
 
 #ifdef FEATURE_PERFTRACING
74 changes: 59 additions & 15 deletions src/coreclr/vm/yieldprocessornormalized.cpp
@@ -10,11 +10,13 @@
 enum class NormalizationState : UINT8
 {
     Uninitialized,
+    PartiallyInitialized,
     Initialized,
     Failed
 };
 
 static const int NsPerYieldMeasurementCount = 8;
+static const int PartialInitializationMeasurementCount = 2; // number of measurements to be done during partial initialization
 static const unsigned int MeasurementPeriodMs = 4000;
 
 static const unsigned int NsPerS = 1000 * 1000 * 1000;
@@ -108,6 +110,21 @@ static double MeasureNsPerYield(unsigned int measureDurationUs)
     return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
 }
 
+void YieldProcessorNormalization::NotifyGC()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(GCHeapUtilities::IsGCHeapInitialized());
+
+    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)s_yieldsPerNormalizedYield);
+}
+
 void YieldProcessorNormalization::PerformMeasurement()
 {
     CONTRACTL
@@ -118,7 +135,7 @@ void YieldProcessorNormalization::PerformMeasurement()
     }
     CONTRACTL_END;
 
-    _ASSERTE(s_isMeasurementScheduled);
+    _ASSERTE(s_isMeasurementScheduled ^ (s_normalizationState == NormalizationState::Uninitialized));
 
     double latestNsPerYield;
     if (s_normalizationState == NormalizationState::Initialized)
@@ -137,19 +154,32 @@ void YieldProcessorNormalization::PerformMeasurement()
         }
         s_nextMeasurementIndex = nextMeasurementIndex;
    }
-    else if (s_normalizationState == NormalizationState::Uninitialized)
+    else if (s_normalizationState == NormalizationState::Uninitialized || s_normalizationState == NormalizationState::PartiallyInitialized)
     {
-        LARGE_INTEGER li;
-        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
-        {
-            // High precision clock not available or clock resolution is too low, resort to defaults
-            s_normalizationState = NormalizationState::Failed;
-            return;
-        }
-        s_performanceCounterTicksPerS = li.QuadPart;
+        int startIndex, endIndex;
+        if (s_normalizationState == NormalizationState::Uninitialized)
+        {
+            LARGE_INTEGER li;
+            if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
Member

This is somewhat pre-existing to this PR, but I think there are some latent bugs in the general measurement logic here...

Notably, Linux always reports 1 billion here for the frequency (1 nanosecond ticks) even if the underlying hardware timer is actually much slower.

Additionally, there are places below where we try to measure the length of time between two QPC calls, and while QPC is guaranteed to be monotonic (never decreasing), there is no guarantee that two separate calls will not return the same value (which can occur if they happen faster than the underlying timer resolution). A simple program such as the following shows this on most modern hardware:

using System;
using System.Runtime.InteropServices;

internal unsafe class Program
{
    private static void Main(string[] args)
    {
        long start = 0;
        _ = QueryPerformanceCounter(&start);

        long end = 0;
        _ = QueryPerformanceCounter(&end);

        Console.WriteLine(end - start); // likely prints 0
    }

    [DllImport("kernel32", ExactSpelling = true)]
    [SuppressGCTransition]
    static extern int QueryPerformanceCounter(long* lpPerformanceCount);
}

Additionally, on some hardware the underlying mechanism used by QPC could have a resolution as low as 10-20 cycles, or it could have a latency of nearly 100-300 ns (detailed in part here: https://learn.microsoft.com/en-us/windows/win32/sysinfo/acquiring-high-resolution-time-stamps). Meanwhile, the pause instruction (x86/x64) can execute in somewhere around 10-20 cycles, 40 cycles, or up to 150 cycles (depending on the hardware in question, with newer and higher core count machines often explicitly having longer delay times). All of these cycle times could be much less than the 100-300 ns latency that a QPC call might have, which can lead to skewed measurements.

We've also seen that yield on Arm64 is known to be a hint which can often be ignored or take so little time that many other ecosystems have switched to using the isb instruction instead to achieve an effect similar to the x86/x64 pause instruction.

It might be good to take a deeper look at what this code is actually doing and how accurate it is compared to mechanisms that are known to measure the actual instruction latency more accurately in a platform-specific manner.

Member

> Notably, Linux always reports 1 billion here for the frequency (1 nanosecond ticks) even if the underlying hardware timer is actually much slower.

Stopwatch.Frequency also seems to report similarly. The tick count reports ticks in nanoseconds. I don't think this affects the measurements; it's just that the frequency doesn't indicate the actual underlying frequency of the clock. The actual frequency of the clock also does not necessarily reflect the total overhead involved in getting the ticks.

> Additionally, there are places below where we try to measure the length of time between two QPC calls, and while QPC is guaranteed to be monotonic (never decreasing), there is no guarantee that two separate calls will not return the same value (which can occur if they happen faster than the underlying timer resolution).

I don't see any assumption being made in this code that the difference wouldn't be zero.

> Additionally, on some hardware the underlying mechanism used by QPC could have a resolution as low as 10-20 cycles, or it could have a latency of nearly 100-300 ns (detailed in part here: https://learn.microsoft.com/en-us/windows/win32/sysinfo/acquiring-high-resolution-time-stamps). Meanwhile, the pause instruction (x86/x64) can execute in somewhere around 10-20 cycles, 40 cycles, or up to 150 cycles (depending on the hardware in question, with newer and higher core count machines often explicitly having longer delay times). All of these cycle times could be much less than the 100-300 ns latency that a QPC call might have, which can lead to skewed measurements.

Yes, that's why the QPC overhead is measured and the measurement duration is adjusted (with some reasonable limits). Previously the measurement was taken over a 10 ms interval, but that was too much, and taking multiple shorter measurements as currently done showed similar measurement results even on systems with higher QPC overhead.

> We've also seen that yield on Arm64 is known to be a hint which can often be ignored or take so little time that many other ecosystems have switched to using the isb instruction instead to achieve an effect similar to the x86/x64 pause instruction.

The normalization should be able to figure that out and compensate by using more iterations. isb may be more useful when there is no normalization, to induce a more similar delay. isb appears to have other side effects (involving an instruction barrier) that may be unnecessary for spin-waiting.

> This is somewhat pre-existing to this PR, but I think there are some latent bugs in the general measurement logic here...

Let me know if you see any specific issues, I can take a look.
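
To illustrate the approach described above (several short sampling windows, keeping the minimum), here is a rough standalone C++ sketch. It is not the runtime's code: the helper name, the 8-window count, and the use of _mm_pause and std::chrono in place of YieldProcessor and QPC are illustrative assumptions, and the snippet is x86/x64-specific.

#include <algorithm>
#include <chrono>
#include <cstdio>
#include <immintrin.h> // _mm_pause; x86/x64 only

// Measure the average cost of one pause over a short window.
static double MeasureOnceNs(int yields)
{
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < yields; ++i)
    {
        _mm_pause();
    }
    auto end = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::nano>(end - start).count() / yields;
}

int main()
{
    // Keep the minimum of several short windows, so that a context switch or
    // clock quantization inflates individual samples, not the chosen value.
    double best = 1e9;
    for (int i = 0; i < 8; ++i) // mirrors NsPerYieldMeasurementCount
    {
        best = std::min(best, MeasureOnceNs(10000));
    }
    std::printf("~%.2f ns per pause (min of 8 windows)\n", best);
    return 0;
}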

Member

> Notably, Linux always reports 1 billion here for the frequency (1 nanosecond ticks) even if the underlying hardware timer is actually much slower.

For instance, suppose the actual clock resolution is 1 us or higher (quite high, though perhaps possible). Each measurement window would then typically be shorter than that resolution, so some reported measurements would come out shorter than the actual time and some longer. The chosen measurement that is used for normalization favors a lower ns-per-yield, which would come from the longer measurements and would be a bit closer to the actual delay.

Member @tannergooding, Mar 24, 2024

> Stopwatch.Frequency also seems to report similarly. The tick count reports ticks in nanoseconds. I don't think this affects the measurements; it's just that the frequency doesn't indicate the actual underlying frequency of the clock. The actual frequency of the clock also does not necessarily reflect the total overhead involved in getting the ticks.

That's because Stopwatch.Frequency just calls into QPF, which is implemented in the PAL as a call to the underlying CLOCK_MONOTONIC (or similar), which always reports 1 ns ticks.

The general consideration here is that we have logic such as this: https://github.com/dotnet/runtime/blob/cf053aba5c42ad94fe696238ff40f28ec82c7362/src/coreclr/vm/yieldprocessornormalized.cpp#L148C13-L153C14, where the first check (did QPF fail) cannot be triggered on Windows and the latter check (li.QuadPart < 1_000_000) cannot be triggered on Linux. Given the intent of the check, the general logic here is "flawed" and isn't actually accounting for whether normalization can be properly done. -- This is particularly relevant as we're much more likely to encounter a low clock resolution device on Linux, where embedded and low-end/legacy device support is more prevalent.
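
As a minimal illustration of why that frequency check is inert on Linux (assuming, as described above, that the PAL maps QPC onto clock_gettime(CLOCK_MONOTONIC)); the fixed constant is an assumption mirroring that mapping, and the snippet is Linux/POSIX-only:

#include <cstdio>
#include <time.h>

int main()
{
    // A QueryPerformanceFrequency built on clock_gettime(CLOCK_MONOTONIC)
    // reports a fixed 1 GHz (1 ns ticks) regardless of the hardware timer,
    // so "li.QuadPart < 1000 * 1000" can never trip on Linux.
    const long long qpcFrequency = 1000 * 1000 * 1000;

    // clock_getres reveals the advertised resolution, which may be much
    // coarser than the 1 ns that the reported frequency suggests.
    timespec res;
    clock_getres(CLOCK_MONOTONIC, &res);
    std::printf("reported frequency: %lld Hz, advertised resolution: %ld ns\n",
                qpcFrequency, (long)res.tv_nsec);
    return 0;
}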

> I don't see any assumption being made in this code that the difference wouldn't be zero.

I think the fact that we aren't explicitly checking for and handling that is the problem.

This call is trying to detect overhead and it's registering it as 0, thus returning a measure duration of 1 us without actually confirming, for example, that 1 us is a reasonable measurement duration:

QueryPerformanceCounter(&li);
UINT64 startTicks = li.QuadPart;
QueryPerformanceCounter(&li);
UINT64 elapsedTicks = li.QuadPart - startTicks;
if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
{
measureDurationUs *= 4;
}
return measureDurationUs;

This value is passed into MeasureNsPerYield: https://github.com/dotnet/runtime/blob/cf053aba5c42ad94fe696238ff40f28ec82c7362/src/coreclr/vm/yieldprocessornormalized.cpp#L59C46-L59C63. yieldCount will, on many machines, simply be 28 and measureDurationTicks will be 10.

On a machine like the Ryzen 7950X, which documents pause to be 64 cycles and therefore to take ~14 ns to execute (base clock of 4.5 GHz, can boost up to 5.7 GHz), this is generally sufficient and gets roughly the right times. However, I've seen it compute anywhere up to 30 ns per pause and down to 10 ns, such as when the core it happens to measure is in a low-power state or has just finished a boost state. This variance can cause quite a bit of noise in some benchmarks or even just general app execution.

We then have some logic just below that which tries to account for inaccurate measurements:

// Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
const double MinNsPerYield = 0.1;
// Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
// really take this long. Limit the maximum to keep the recorded values reasonable.
const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));

However, the numbers are a bit odd: there is no such thing as a 10 GHz processor, so 0.1 as a MinNsPerYield is a completely unrealistic value. Beyond that, a more typical processor is around 2.5 GHz (often more for Desktop or Workstation, sometimes less for Mobile), and so a reasonable yield time (based on actual known data) would likely be at a minimum of around 10 cycles, giving around 2-4 ns per yield as a reasonable minimum.
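
To make the cycle arithmetic concrete, here is a small sketch; the clock speeds and pause latencies below are illustrative assumptions drawn from the discussion, not measured data:

#include <cstdio>

int main()
{
    // ns per yield = cycles per yield / cycles per ns.
    // At 2.5 GHz a 10-cycle pause takes ~4 ns; hitting the current
    // MinNsPerYield of 0.1 ns would need a 1-cycle pause on a 10 GHz core.
    const double clocksGHz[] = { 2.5, 4.5, 5.7 };
    const double cyclesPerYield[] = { 10.0, 40.0, 64.0 };
    for (double ghz : clocksGHz)
    {
        for (double cycles : cyclesPerYield)
        {
            std::printf("%.1f GHz, %3.0f cycles -> %5.1f ns per yield\n",
                        ghz, cycles, cycles / ghz);
        }
    }
    return 0;
}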

> The normalization should be able to figure that out and compensate by using more iterations. isb may be more useful when there is no normalization, to induce a more similar delay. isb appears to have other side effects (involving an instruction barrier) that may be unnecessary for spin-waiting.

The main consideration is that if the underlying yield/pause instruction is truly a no-op, then we likely should be falling back to an alternative sequence that better matches the actual intent of the spin loop. If we're literally just executing what is functionally an empty while() loop, that's not great for the system.
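
For reference, the isb-based alternative mentioned above looks roughly like this (a sketch for GCC/Clang only; whether the barrier's side effects are acceptable for spin-waiting is exactly what's being debated here):

// Spin-wait hint sketch. On Arm64, "yield" is a hint that some cores
// effectively ignore, while "isb sy" flushes the pipeline and induces a
// delay closer to x86/x64 "pause", at the cost of an instruction barrier.
static inline void SpinWaitHint()
{
#if defined(__aarch64__)
    __asm__ __volatile__("isb sy" ::: "memory");
#elif defined(__x86_64__) || defined(__i386__)
    __asm__ __volatile__("pause" ::: "memory");
#endif
}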

There are a lot of cases, particularly for Arm64 systems, where we're seeing tons of context switches compared to an equivalent x64 machine. We're also spending far more time executing instructions in places like the underlying Semaphore spin loop used by the thread pool.

So not only are we spinning without letting the processor take advantage of a proper lower-power state, but we're also doing a lot of potentially expensive work on a core whose time could be spent elsewhere.

For example, on my Volterra it's measuring around 0.5-5 ns per yield, with some of those measurements being done on big cores and some on LITTLE cores, which means that separate measurements of yield could be problematic. If you just spin doing MeasureNsPerYield forever, measuring the min and max seen over time and the ProcessorId it's running on, for example:

min: 0.4478280340349306 5
max: 0.4478280340349306 6
min: 0.42662116040955633 6
max: 0.5071461502996772 5
min: 0.41736227045075125 6
min: 0.3985651654045436 5
max: 2.345415778251599 6
max: 2.487562189054726 5
min: 0.3838771593090211 6
min: 0.37091988130563797 5
max: 36.47746243739566 6
max: 89.28571428571429 5
max: 173.2193732193732 6
max: 174.28571428571428 0
max: 182.33333333333334 6

The really high numbers (89 and 173-182) are caused by OS context switches -- similar happens on the 7950X:

min: 12.76595744680851 8
max: 12.76595744680851 8
max: 13.924050632911392 8
min: 12.658227848101266 8
min: 11.702127659574469 8
max: 16.949152542372882 8
max: 24.390243902439025 30
max: 26.829268292682926 26
max: 81.69014084507042 8
max: 142.85714285714286 8
max: 182.33333333333334 0

Member @tannergooding, Mar 24, 2024

I think these unaccounted-for nuances are causing us issues, especially for Arm64 on Linux, and are part of why the retired instruction counts, the number of context switches (which are currently more expensive for Arm64), and other metrics are so much higher.

I would imagine some of them would also cause issues for things like Intel Alder Lake CPUs, where you have Performance and Efficiency cores with very different cycle times and base/max clock speeds.

For example, the same code running on WSL or an actual direct boot to Linux (Ubuntu 22.04) reports the following for Ryzen 7950X:

min: 8.333333333333334 23
max: 8.333333333333334 23
min: 7.046979865771812 23
min: 6.709677419354839 23
min: 6.580645161290323 23
min: 6.538461538461538 23
max: 9.0625 23
max: 10.412371134020619 23
max: 40.796460176991154 23
max: 54.01639344262295 23
max: 102.53571428571429 23
max: 181.78571428571428 23
max: 182.14285714285714 23
max: 182.33333333333334 23
min: 6.5 23
min: 6.4743589743589745 23
min: 6.3354037267080745 23
min: 6.31875 23
min: 6.3125 23
min: 6.273291925465839 22

And the following for Volterra:

min: 0.7791195948578107 2
max: 0.7791195948578107 2
min: 0.5340114431023522 2
min: 0.43294614572333684 2
min: 0.41355658664514827 2
min: 0.3994154895275207 2
min: 0.3990119703591108 2
min: 0.3896736483195324 2
min: 0.38767019667170954 2
min: 0.3806164129223914 2
min: 0.3795242062390077 2
min: 0.37821482602118 2
max: 15.130674002751032 2
max: 32.18707015130674 2
min: 0.3668715032559846 2
max: 37.61467889908257 2
max: 53.211009174311926 2
max: 55.04587155963303 2
max: 166.9724770642202 2
max: 182.33333333333334 2

So the same code on the same hardware is reporting significantly different results on Linux, and on Arm64 it isn't measuring something which really does what we want, as it's taking so little time.

Member

@tannergooding, can you file an issue? We can continue the discussion there.

Member

Logged #100242

+            {
+                // High precision clock not available or clock resolution is too low, resort to defaults
+                s_normalizationState = NormalizationState::Failed;
+                return;
+            }
+            s_performanceCounterTicksPerS = li.QuadPart;
+
+            startIndex = 0;
+            endIndex = PartialInitializationMeasurementCount;
+        }
+        else
+        {
+            startIndex = PartialInitializationMeasurementCount;
+            endIndex = NsPerYieldMeasurementCount;
+        }
 
         unsigned int measureDurationUs = DetermineMeasureDurationUs();
-        for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+        latestNsPerYield = 0;
+        for (int i = startIndex; i < endIndex; ++i)
         {
             latestNsPerYield = MeasureNsPerYield(measureDurationUs);
             AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
@@ -158,7 +188,7 @@ void YieldProcessorNormalization::PerformMeasurement()
                 AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
             }
 
-            if (i < NsPerYieldMeasurementCount - 1)
+            if (i < endIndex - 1)
             {
                 FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
eduardo-vp marked this conversation as resolved.
             }
@@ -171,7 +201,11 @@ void YieldProcessorNormalization::PerformMeasurement()
     }
 
     double establishedNsPerYield = s_nsPerYieldMeasurements[0];
-    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
+    int endIndex =
+        (s_normalizationState == NormalizationState::Uninitialized) ?
+        PartialInitializationMeasurementCount :
+        NsPerYieldMeasurementCount;
+    for (int i = 1; i < endIndex; ++i)
     {
         double nsPerYield = s_nsPerYieldMeasurements[i];
         if (nsPerYield < establishedNsPerYield)
@@ -198,10 +232,20 @@ void YieldProcessorNormalization::PerformMeasurement()
         Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
     _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
 
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+    if (GCHeapUtilities::IsGCHeapInitialized())
+    {
+        NotifyGC();
+    }
+
+    if (s_normalizationState != NormalizationState::Uninitialized)
+    {
+        s_previousNormalizationTimeMs = GetTickCount();
+    }
 
-    s_previousNormalizationTimeMs = GetTickCount();
-    s_normalizationState = NormalizationState::Initialized;
+    s_normalizationState =
+        (s_normalizationState == NormalizationState::Uninitialized) ?
+        NormalizationState::PartiallyInitialized :
+        NormalizationState::Initialized;
     s_isMeasurementScheduled = false;
 }

@@ -224,7 +268,7 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
             return;
         }
     }
-    else if (normalizationState == NormalizationState::Uninitialized)
+    else if (normalizationState == NormalizationState::Uninitialized || normalizationState == NormalizationState::PartiallyInitialized)
     {
     }
     else
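
For context on how the values computed above are consumed (a hedged reconstruction, not code from this PR): establishedNsPerYield is folded into a yields-per-normalized-yield multiplier, and spin-wait loops issue that many hardware yields per logical yield so the wall-clock delay is comparable across hardware. The function name and the target constant below are illustrative assumptions:

#include <algorithm>

// Illustrative target for how long one "normalized yield" should take.
const double AssumedTargetNsPerNormalizedYield = 37.0;

unsigned int ComputeYieldsPerNormalizedYield(double establishedNsPerYield)
{
    // Fast pause implementations get more iterations and slow ones fewer,
    // so a normalized yield takes roughly the same time everywhere.
    return std::max(1u,
        (unsigned int)(AssumedTargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
}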