Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into interlockedgeneric
Browse files Browse the repository at this point in the history
  • Loading branch information
stephentoub committed Jul 18, 2024
2 parents b439797 + 513b503 commit 58e341d
Show file tree
Hide file tree
Showing 39 changed files with 1,326 additions and 723 deletions.
5 changes: 4 additions & 1 deletion eng/pipelines/common/templates/runtimes/run-test-job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ jobs:
timeoutInMinutes: 390
${{ if in(parameters.testGroup, 'gcstress-extra', 'r2r-extra', 'clrinterpreter', 'pgo', 'pgostress', 'jit-experimental') }}:
timeoutInMinutes: 510
${{ if in(parameters.testGroup, 'jitstress-isas-x86', 'jitstress-isas-avx512') }}:
${{ if in(parameters.testGroup, 'jitstress-isas-x86', 'jitstress-isas-avx512', 'jitstress-isas-sve') }}:
timeoutInMinutes: 960

steps:
Expand Down Expand Up @@ -428,6 +428,9 @@ jobs:
- jitstress_isas_x86_evex
- jitstress_isas_x86_noavx512
- jitstressregs0x2000
${{ if in(parameters.testGroup, 'jitstress-isas-sve') }}:
scenarios:
- jitstress_isas_arm64_sve
${{ if in(parameters.testGroup, 'jitstressregs-x86') }}:
scenarios:
- jitstressregs1_x86_noavx
Expand Down
2 changes: 1 addition & 1 deletion eng/pipelines/common/templates/runtimes/test-variables.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ variables:
# gc reliability may take up to 2 hours to shutdown. Some scenarios have very long iteration times.
- name: timeoutPerTestInMinutes
value: 240
- ${{ if in(parameters.testGroup, 'jitstress', 'jitstress-random', 'jitstress-isas-arm', 'jitstress-isas-x86', 'jitstress-isas-avx512', 'jitstressregs-x86', 'jitstressregs', 'jitstress2-jitstressregs', 'jitelthookenabled' ) }}:
- ${{ if in(parameters.testGroup, 'jitstress', 'jitstress-random', 'jitstress-isas-arm', 'jitstress-isas-x86', 'jitstress-isas-avx512', 'jitstress-isas-sve', 'jitstressregs-x86', 'jitstressregs', 'jitstress2-jitstressregs', 'jitelthookenabled' ) }}:
- name: timeoutPerTestCollectionInMinutes
value: 120
- name: timeoutPerTestInMinutes
Expand Down
36 changes: 36 additions & 0 deletions eng/pipelines/coreclr/jitstress-isas-sve.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# CI (merge/push) triggers are disabled. This pipeline runs for GitHub PRs that
# match the `pr` filter below and on the weekly schedule defined under `schedules`.
trigger: none

# Only run PR validation for changes to the JIT files that are likely to affect SVE
# (the Arm64 Scalable Vector Extension).
pr:
branches:
include:
- main
paths:
include:
- src/coreclr/jit/hwintrinsiccodegenarm64.cpp
- src/coreclr/jit/hwintrinsiclistarm64sve.h
- src/coreclr/jit/hwintrinsicarm64.cpp
- src/coreclr/jit/instrsarm64sve.h
- src/coreclr/jit/emitarm64sve.cpp
- src/coreclr/jit/emitfmtsarm64sve.h
- src/coreclr/jit/lsraarm64.cpp

# Weekly run against main: minute 30, hour 19, day-of-week 6 (Saturday).
# The displayName (11:30 AM at UTC-8:00) implies the cron time is expressed in UTC.
# NOTE(review): `always: true` should make the schedule fire even when main has no
# new changes since the last run - confirm against the Azure Pipelines documentation.
schedules:
- cron: "30 19 * * 6"
displayName: Sat at 11:30 AM (UTC-8:00)
branches:
include:
- main
always: true

# Shared variable definitions used by the runtime pipelines.
variables:
- template: /eng/pipelines/common/variables.yml

# Reuse the common JIT outerloop pipeline template, parameterized with this test group.
extends:
template: /eng/pipelines/coreclr/templates/jit-outerloop-pipeline.yml
parameters:
platforms:
# Just run on Windows for now, because the ABI is universal for other platforms.
# NOTE(review): the scenario for this test group is named jitstress_isas_arm64_sve,
# while the only platform listed here is windows_x64 - confirm this is intended.
- windows_x64
testGroup: jitstress-isas-sve
6 changes: 0 additions & 6 deletions src/coreclr/gc/env/gcenv.os.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,6 @@
#ifndef __GCENV_OS_H__
#define __GCENV_OS_H__

#ifdef HAS_SYSTEM_YIELDPROCESSOR
// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
#undef YieldProcessor
#define YieldProcessor System_YieldProcessor
#endif

#define NUMA_NODE_UNDEFINED UINT16_MAX

bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
Expand Down
39 changes: 18 additions & 21 deletions src/coreclr/inc/yieldprocessornormalized.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,11 @@

#pragma once

// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
// the intention is to use the system-default implementation of YieldProcessor().
#define HAS_SYSTEM_YIELDPROCESSOR
#ifdef FEATURE_NATIVEAOT
FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
#else
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#ifdef YieldProcessor
#undef YieldProcessor
#endif
#define YieldProcessor Dont_Use_YieldProcessor

#define DISABLE_COPY(T) \
T(const T &) = delete; \
Expand Down Expand Up @@ -144,17 +141,17 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
{
_ASSERTE(count != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
if (sizeof(size_t) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
// On platforms with a small size_t, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (count > MaxCount)
{
count = MaxCount;
}
}

SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
_ASSERTE(n != 0);
do
{
Expand Down Expand Up @@ -189,9 +186,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
{
_ASSERTE(preSkylakeCount != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
if (sizeof(size_t) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
// On platforms with a small size_t, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
Expand All @@ -200,7 +197,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
{
n = 1;
Expand All @@ -227,9 +224,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl

_ASSERTE(preSkylakeCount != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
if (sizeof(size_t) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
// On platforms with a small size_t, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
Expand All @@ -238,8 +235,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
SIZE_T n =
(SIZE_T)preSkylakeCount *
size_t n =
(size_t)preSkylakeCount *
YieldProcessorNormalization::s_yieldsPerNormalizedYield /
PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
Expand Down Expand Up @@ -268,11 +265,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
unsigned int spinIteration)
{
// This shift value should be adjusted based on the asserted conditions below
const UINT8 MaxShift = 3;
static_assert_no_msg(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
const uint8_t MaxShift = 3;
static_assert(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
static_assert(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");

unsigned int n;
if (spinIteration <= MaxShift &&
Expand Down
6 changes: 3 additions & 3 deletions src/coreclr/jit/codegenarm64test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6592,11 +6592,11 @@ void CodeGen::genArm64EmitterUnitTestsSve()
// IF_SVE_GV_3A
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V0, REG_V1, REG_V0, 0, 0,
INS_OPTS_SCALABLE_S); // FCMLA <Zda>.S, <Zn>.S, <Zm>.S[<imm>], <const>
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V2, REG_V3, REG_V5, 1, 90,
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V2, REG_V3, REG_V5, 1, 1,
INS_OPTS_SCALABLE_S); // FCMLA <Zda>.S, <Zn>.S, <Zm>.S[<imm>], <const>
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V4, REG_V5, REG_V10, 0, 180,
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V4, REG_V5, REG_V10, 0, 2,
INS_OPTS_SCALABLE_S); // FCMLA <Zda>.S, <Zn>.S, <Zm>.S[<imm>], <const>
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V6, REG_V7, REG_V15, 1, 270,
theEmitter->emitIns_R_R_R_I_I(INS_sve_fcmla, EA_SCALABLE, REG_V6, REG_V7, REG_V15, 1, 3,
INS_OPTS_SCALABLE_S); // FCMLA <Zda>.S, <Zn>.S, <Zm>.S[<imm>], <const>

// IF_SVE_GX_3A
Expand Down
13 changes: 6 additions & 7 deletions src/coreclr/jit/emitarm64sve.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5825,14 +5825,13 @@ void emitter::emitInsSve_R_R_R_I_I(instruction ins,

case INS_sve_fcmla:
assert(opt == INS_OPTS_SCALABLE_S);
assert(isVectorRegister(reg1)); // ddddd
assert(isVectorRegister(reg2)); // nnnnn
assert(isLowVectorRegister(reg3)); // mmmm
assert(isValidUimm<1>(imm1)); // i
assert(isValidRot(imm2)); // rr
assert(isVectorRegister(reg1)); // ddddd
assert(isVectorRegister(reg2)); // nnnnn
assert(isLowVectorRegister(reg3)); // mmmm
assert(isValidUimm<1>(imm1)); // i
assert(emitIsValidEncodedRotationImm0_to_270(imm2)); // rr

// Convert imm2 from rotation value (0-270) to bitwise representation (0-3)
imm = (imm1 << 2) | emitEncodeRotationImm0_to_270(imm2);
imm = (imm1 << 2) | imm2;
fmt = IF_SVE_GV_3A;
break;

Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30437,10 +30437,10 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
#if defined(TARGET_XARCH)
tryHandle = op->OperIsHWIntrinsic();
#elif defined(TARGET_ARM64)
if (op->OperIsHWIntrinsic() && op->OperIsHWIntrinsic(NI_Sve_CreateTrueMaskAll))
if (op->OperIsHWIntrinsic(NI_Sve_CreateTrueMaskAll))
{
op = op2;
tryHandle = true;
tryHandle = op->OperIsHWIntrinsic();
}
#endif // TARGET_ARM64

Expand Down
4 changes: 1 addition & 3 deletions src/coreclr/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -963,9 +963,7 @@ static void ValidateHWIntrinsicInfo(CORINFO_InstructionSet isa, NamedIntrinsic n
if (info.numArgs != -1)
{
// We should only have an expected number of arguments
#if defined(TARGET_ARM64)
assert((info.numArgs >= 0) && (info.numArgs <= 4));
#elif defined(TARGET_XARCH)
#if defined(TARGET_ARM64) || defined(TARGET_XARCH)
assert((info.numArgs >= 0) && (info.numArgs <= 5));
#else
unreached();
Expand Down
13 changes: 13 additions & 0 deletions src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,14 @@ struct HWIntrinsicInfo
break;
}

case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
{
// This intrinsic takes five arguments, two of which are immediates (index and rotation).
// HWIntrinsicInfo::lookupImmBounds numbers the rotation as immediate 1 and the index as
// immediate 2, i.e. flipped relative to their order in the signature.
// NOTE(review): the positions below look like offsets counted back from the last
// argument (0 = rotation, the final argument; 1 = index, the one before it) - confirm.
assert(sig->numArgs == 5);
*imm1Pos = 0;
*imm2Pos = 1;
break;
}

default:
{
assert(sig->numArgs > 0);
Expand All @@ -1105,6 +1113,7 @@ struct HWIntrinsic final
, op2(nullptr)
, op3(nullptr)
, op4(nullptr)
, op5(nullptr)
, numOperands(0)
, baseType(TYP_UNDEF)
{
Expand Down Expand Up @@ -1134,6 +1143,7 @@ struct HWIntrinsic final
GenTree* op2;
GenTree* op3;
GenTree* op4;
GenTree* op5;
size_t numOperands;
var_types baseType;

Expand All @@ -1144,6 +1154,9 @@ struct HWIntrinsic final

switch (numOperands)
{
case 5:
op5 = node->Op(5);
FALLTHROUGH;
case 4:
op4 = node->Op(4);
FALLTHROUGH;
Expand Down
66 changes: 66 additions & 0 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,27 @@ void HWIntrinsicInfo::lookupImmBounds(
immUpperBound = 3;
break;

case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
// rotation comes after index in the intrinsic's signature,
// but flip the order here so we check the larger range first.
// This conforms to the existing logic in LinearScan::BuildHWIntrinsic
// when determining if we need an internal register for the jump table.
// This flipped ordering is reflected in HWIntrinsicInfo::GetImmOpsPositions.
if (immNumber == 1)
{
// Bounds for rotation
immLowerBound = 0;
immUpperBound = 3;
}
else
{
// Bounds for index
assert(immNumber == 2);
immLowerBound = 0;
immUpperBound = 1;
}
break;

case NI_Sve_TrigonometricMultiplyAddCoefficient:
immLowerBound = 0;
immUpperBound = 7;
Expand Down Expand Up @@ -3004,6 +3025,51 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Sve_MultiplyAddRotateComplexBySelectedScalar:
{
// Imports the five-argument form of this intrinsic. The last two arguments (op4 and op5)
// are asserted to be immediate operands and get a range check when one is needed;
// op1..op3 are fetched without any range check.
assert(sig->numArgs == 5);
assert(!isScalar);

// Walk the signature's argument list so each argument's type can be queried below.
CORINFO_ARG_LIST_HANDLE arg1 = sig->args;
CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(arg1);
CORINFO_ARG_LIST_HANDLE arg3 = info.compCompHnd->getArgNext(arg2);
CORINFO_ARG_LIST_HANDLE arg4 = info.compCompHnd->getArgNext(arg3);
CORINFO_ARG_LIST_HANDLE arg5 = info.compCompHnd->getArgNext(arg4);
var_types argType = TYP_UNKNOWN;
CORINFO_CLASS_HANDLE argClass = NO_CLASS_HANDLE;

// Immediate number 1 is the rotation and number 2 is the index, matching the numbering
// used by HWIntrinsicInfo::lookupImmBounds for this intrinsic.
int imm1LowerBound, imm1UpperBound; // Range for rotation
int imm2LowerBound, imm2UpperBound; // Range for index
HWIntrinsicInfo::lookupImmBounds(intrinsic, simdSize, simdBaseType, 1, &imm1LowerBound, &imm1UpperBound);
HWIntrinsicInfo::lookupImmBounds(intrinsic, simdSize, simdBaseType, 2, &imm2LowerBound, &imm2UpperBound);

// Operands are retrieved last-to-first (op5 down to op1).
// NOTE(review): presumably getArgForHWIntrinsic pops from the importer's evaluation
// stack, which would make this order mandatory - confirm before reordering.

// op5: the rotation immediate, checked against the imm1 bounds.
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg5, &argClass)));
GenTree* op5 = getArgForHWIntrinsic(argType, argClass);
assert(HWIntrinsicInfo::isImmOp(intrinsic, op5));
op5 = addRangeCheckIfNeeded(intrinsic, op5, mustExpand, imm1LowerBound, imm1UpperBound);

// op4: the index immediate, checked against the imm2 bounds.
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg4, &argClass)));
op4 = getArgForHWIntrinsic(argType, argClass);
assert(HWIntrinsicInfo::isImmOp(intrinsic, op4));
op4 = addRangeCheckIfNeeded(intrinsic, op4, mustExpand, imm2LowerBound, imm2UpperBound);

// op3, op2, op1: the remaining operands, fetched without range checks.
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg3, &argClass)));
op3 = getArgForHWIntrinsic(argType, argClass);
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass)));
op2 = getArgForHWIntrinsic(argType, argClass);
argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg1, &argClass)));
op1 = getArgForHWIntrinsic(argType, argClass);

SetOpLclRelatedToSIMDIntrinsic(op1);
SetOpLclRelatedToSIMDIntrinsic(op2);
SetOpLclRelatedToSIMDIntrinsic(op3);
SetOpLclRelatedToSIMDIntrinsic(op4);
SetOpLclRelatedToSIMDIntrinsic(op5);
// Build the intrinsic node with all five operands in signature order.
retNode = new (this, GT_HWINTRINSIC) GenTreeHWIntrinsic(retType, getAllocator(CMK_ASTNode), intrinsic,
simdBaseJitType, simdSize, op1, op2, op3, op4, op5);
break;
}

default:
{
return nullptr;
Expand Down
Loading

0 comments on commit 58e341d

Please sign in to comment.