Skip to content

Commit

Permalink
Adding support for X86Base.Pause() and ArmBase.Yield() (#61065)
Browse files Browse the repository at this point in the history
* Adding support for X86Base.Pause() and ArmBase.Yield()

* Applying formatting patch

* Ensure NI_ArmBase_Yield actually gets through to codegen on arm64
  • Loading branch information
tannergooding committed Nov 15, 2021
1 parent a93e0d2 commit fdafc7c
Show file tree
Hide file tree
Showing 25 changed files with 273 additions and 38 deletions.
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -3318,6 +3318,7 @@ class Compiler
unsigned simdSize,
bool isSimdAsHWIntrinsic);

GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID);
GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID);
GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type,
GenTree* op1,
Expand Down
32 changes: 17 additions & 15 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -1240,21 +1240,22 @@ class emitter

#define PERFSCORE_THROUGHPUT_1C 1.0f // Single Issue

#define PERFSCORE_THROUGHPUT_2C 2.0f // slower - 2 cycles
#define PERFSCORE_THROUGHPUT_3C 3.0f // slower - 3 cycles
#define PERFSCORE_THROUGHPUT_4C 4.0f // slower - 4 cycles
#define PERFSCORE_THROUGHPUT_5C 5.0f // slower - 5 cycles
#define PERFSCORE_THROUGHPUT_6C 6.0f // slower - 6 cycles
#define PERFSCORE_THROUGHPUT_7C 7.0f // slower - 7 cycles
#define PERFSCORE_THROUGHPUT_8C 8.0f // slower - 8 cycles
#define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles
#define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles
#define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles
#define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles
#define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles
#define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles
#define PERFSCORE_THROUGHPUT_52C 52.0f // slower - 52 cycles
#define PERFSCORE_THROUGHPUT_57C 57.0f // slower - 57 cycles
#define PERFSCORE_THROUGHPUT_2C 2.0f // slower - 2 cycles
#define PERFSCORE_THROUGHPUT_3C 3.0f // slower - 3 cycles
#define PERFSCORE_THROUGHPUT_4C 4.0f // slower - 4 cycles
#define PERFSCORE_THROUGHPUT_5C 5.0f // slower - 5 cycles
#define PERFSCORE_THROUGHPUT_6C 6.0f // slower - 6 cycles
#define PERFSCORE_THROUGHPUT_7C 7.0f // slower - 7 cycles
#define PERFSCORE_THROUGHPUT_8C 8.0f // slower - 8 cycles
#define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles
#define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles
#define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles
#define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles
#define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles
#define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles
#define PERFSCORE_THROUGHPUT_52C 52.0f // slower - 52 cycles
#define PERFSCORE_THROUGHPUT_57C 57.0f // slower - 57 cycles
#define PERFSCORE_THROUGHPUT_140C 140.0f // slower - 140 cycles

#define PERFSCORE_LATENCY_ILLEGAL -1024.0f

Expand All @@ -1281,6 +1282,7 @@ class emitter
#define PERFSCORE_LATENCY_26C 26.0f
#define PERFSCORE_LATENCY_62C 62.0f
#define PERFSCORE_LATENCY_69C 69.0f
#define PERFSCORE_LATENCY_140C 140.0f
#define PERFSCORE_LATENCY_400C 400.0f // Intel microcode issue with these instructions

#define PERFSCORE_LATENCY_BRANCH_DIRECT 1.0f // cost of an unconditional branch
Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/jit/emitarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14588,6 +14588,12 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
result.insThroughput = PERFSCORE_THROUGHPUT_ZERO;
result.insLatency = PERFSCORE_LATENCY_ZERO;
}
else if (ins == INS_yield)
{
// @ToDo - find out the actual latency, match x86/x64 for now
result.insThroughput = PERFSCORE_THROUGHPUT_140C;
result.insLatency = PERFSCORE_LATENCY_140C;
}
else
{
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
Expand Down
14 changes: 11 additions & 3 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2898,7 +2898,8 @@ void emitter::emitIns(instruction ins)
ins == INS_r_movsp || ins == INS_r_stosb || ins == INS_r_stosd || ins == INS_r_stosp || ins == INS_ret ||
ins == INS_sahf || ins == INS_stosb || ins == INS_stosd || ins == INS_stosp
// These instructions take zero operands
|| ins == INS_vzeroupper || ins == INS_lfence || ins == INS_mfence || ins == INS_sfence);
|| ins == INS_vzeroupper || ins == INS_lfence || ins == INS_mfence || ins == INS_sfence ||
ins == INS_pause);

assert(assertCond);
}
Expand Down Expand Up @@ -12333,8 +12334,8 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
// Due to elided register moves, we can't have the following assert.
// For example, consider:
// t85 = LCL_VAR byref V01 arg1 rdx (last use) REG rdx
// /--* t85 byref
// * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx
// /--* t85 byref
// * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx
// Here, V01 is type `long` on entry, then is stored as a byref. But because
// the register allocator assigned the same register, no instruction was
// generated, and we only (currently) make gcref/byref changes in emitter GC info
Expand Down Expand Up @@ -16104,6 +16105,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
break;

case INS_pause:
{
result.insLatency = PERFSCORE_LATENCY_140C;
result.insThroughput = PERFSCORE_THROUGHPUT_140C;
break;
}

default:
// unhandled instruction insFmt combination
perfScoreUnhandledInstruction(id, &result);
Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21798,6 +21798,12 @@ GenTree* Compiler::gtNewSimdZeroNode(var_types type,
return gtNewSimdHWIntrinsicNode(type, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
}

//------------------------------------------------------------------------
// gtNewScalarHWIntrinsicNode: Creates a scalar hardware intrinsic node that takes no operands.
//
// Arguments:
//    type          - the type given to the new node
//    hwIntrinsicID - the intrinsic the new node represents
//
// Return Value:
//    The new GT_HWINTRINSIC node, constructed with CORINFO_TYPE_UNDEF as its base JIT type,
//    a SIMD size of zero, and isSimdAsHWIntrinsic set to false.
//
GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID)
{
    const bool isSimdAsHWIntrinsic = false;

    GenTreeHWIntrinsic* const node =
        new (this, GT_HWINTRINSIC) GenTreeHWIntrinsic(type, hwIntrinsicID, CORINFO_TYPE_UNDEF, 0, isSimdAsHWIntrinsic);

    return node;
}

GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID)
{
SetOpLclRelatedToSIMDIntrinsic(op1);
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,7 @@ struct HWIntrinsic final

if (baseType == TYP_UNKNOWN)
{
assert(category == HW_Category_Scalar);
assert((category == HW_Category_Scalar) || (category == HW_Category_Special));

if (HWIntrinsicInfo::BaseTypeFromFirstArg(id))
{
Expand Down
24 changes: 20 additions & 4 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,17 +308,23 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
var_types retType,
unsigned simdSize)
{
HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsic);
int numArgs = sig->numArgs;
var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsic);
int numArgs = sig->numArgs;

if (!featureSIMD || !IsBaselineSimdIsaSupported())
{
return nullptr;
}

assert(numArgs >= 0);
assert(varTypeIsArithmetic(simdBaseType));

var_types simdBaseType = TYP_UNKNOWN;

if (intrinsic != NI_ArmBase_Yield)
{
simdBaseType = JitType2PreciseVarType(simdBaseJitType);
assert(varTypeIsArithmetic(simdBaseType));
}

GenTree* retNode = nullptr;
GenTree* op1 = nullptr;
Expand All @@ -327,6 +333,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

switch (intrinsic)
{
case NI_ArmBase_Yield:
{
assert(sig->numArgs == 0);
assert(JITtype2varType(sig->retType) == TYP_VOID);
assert(simdSize == 0);

retNode = gtNewScalarHWIntrinsicNode(TYP_VOID, intrinsic);
break;
}

case NI_Vector64_Abs:
case NI_Vector128_Abs:
{
Expand Down
19 changes: 19 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
emitSize = emitActualTypeSize(intrin.baseType);
opt = INS_OPTS_NONE;
}
else if (intrin.category == HW_Category_Special)
{
assert(intrin.id == NI_ArmBase_Yield);

emitSize = EA_UNKNOWN;
opt = INS_OPTS_NONE;
}
else
{
emitSize = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize()));
Expand Down Expand Up @@ -443,6 +450,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}
break;

case NI_ArmBase_Yield:
{
ins = INS_yield;
break;
}

default:
ins = HWIntrinsicInfo::lookupIns(intrin.id, intrin.baseType);
break;
Expand Down Expand Up @@ -735,6 +748,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}
break;

case NI_ArmBase_Yield:
{
GetEmitter()->emitIns(ins);
break;
}

// mvni doesn't support the range of element types, so hard code the 'opts' value.
case NI_Vector64_get_Zero:
case NI_Vector64_get_AllBitsSet:
Expand Down
21 changes: 16 additions & 5 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1447,6 +1447,8 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node)
{
NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;

genConsumeOperands(node);

switch (intrinsicId)
{
case NI_X86Base_BitScanForward:
Expand All @@ -1459,16 +1461,25 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node)
var_types targetType = node->TypeGet();
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);

genConsumeOperands(node);
genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType), targetReg, op1);
genProduceReg(node);
break;
}

case NI_X86Base_Pause:
{
assert(node->GetSimdBaseType() == TYP_UNKNOWN);
assert(node->gtGetOp1() == nullptr);
assert(node->gtGetOp2() == nullptr);
GetEmitter()->emitIns(INS_pause);
break;
}

default:
unreached();
break;
}

genProduceReg(node);
}

//------------------------------------------------------------------------
Expand Down Expand Up @@ -1532,7 +1543,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)

case NI_SSE_StoreFence:
{
assert(baseType == TYP_VOID);
assert(baseType == TYP_UNKNOWN);
assert(op1 == nullptr);
assert(op2 == nullptr);
emit->emitIns(INS_sfence);
Expand Down Expand Up @@ -1617,7 +1628,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)

case NI_SSE2_LoadFence:
{
assert(baseType == TYP_VOID);
assert(baseType == TYP_UNKNOWN);
assert(op1 == nullptr);
assert(op2 == nullptr);
emit->emitIns(INS_lfence);
Expand All @@ -1626,7 +1637,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)

case NI_SSE2_MemoryFence:
{
assert(baseType == TYP_VOID);
assert(baseType == TYP_UNKNOWN);
assert(op1 == nullptr);
assert(op2 == nullptr);
emit->emitIns(INS_mfence);
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,7 @@ HARDWARE_INTRINSIC(Aes, PolynomialMultiplyWideningUpper,
// Base Intrinsics
HARDWARE_INTRINSIC(ArmBase, LeadingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_clz, INS_clz, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoFloatingPointUsed)
HARDWARE_INTRINSIC(ArmBase, ReverseElementBits, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rbit, INS_rbit, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed)
HARDWARE_INTRINSIC(ArmBase, Yield, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size Number of arguments Instructions Category Flags
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ HARDWARE_INTRINSIC(Vector256, Xor,
// X86Base Intrinsics
HARDWARE_INTRINSIC(X86Base, BitScanForward, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsf, INS_bsf, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, BitScanReverse, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bsr, INS_bsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(X86Base, Pause, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)

// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
// ISA Function name SIMD size NumArg Instructions Category Flags
Expand Down
24 changes: 20 additions & 4 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
case InstructionSet_Vector256:
case InstructionSet_Vector128:
case InstructionSet_X86Base:
return impBaseIntrinsic(intrinsic, clsHnd, method, sig, simdBaseJitType, retType, simdSize);
case InstructionSet_SSE:
return impSSEIntrinsic(intrinsic, method, sig);
Expand Down Expand Up @@ -548,8 +549,13 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
return nullptr;
}

var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
assert(varTypeIsArithmetic(simdBaseType));
var_types simdBaseType = TYP_UNKNOWN;

if (intrinsic != NI_X86Base_Pause)
{
simdBaseType = JitType2PreciseVarType(simdBaseJitType);
assert(varTypeIsArithmetic(simdBaseType));
}

switch (intrinsic)
{
Expand Down Expand Up @@ -1532,6 +1538,16 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_X86Base_Pause:
{
assert(sig->numArgs == 0);
assert(JITtype2varType(sig->retType) == TYP_VOID);
assert(simdSize == 0);

retNode = gtNewScalarHWIntrinsicNode(TYP_VOID, intrinsic);
break;
}

default:
{
return nullptr;
Expand Down Expand Up @@ -1604,7 +1620,7 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HAND
case NI_SSE_StoreFence:
assert(sig->numArgs == 0);
assert(JITtype2varType(sig->retType) == TYP_VOID);
retNode = gtNewSimdHWIntrinsicNode(TYP_VOID, intrinsic, CORINFO_TYPE_VOID, 0);
retNode = gtNewScalarHWIntrinsicNode(TYP_VOID, intrinsic);
break;

default:
Expand Down Expand Up @@ -1667,7 +1683,7 @@ GenTree* Compiler::impSSE2Intrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HAN
assert(JITtype2varType(sig->retType) == TYP_VOID);
assert(simdSize == 0);

retNode = gtNewSimdHWIntrinsicNode(TYP_VOID, intrinsic, CORINFO_TYPE_VOID, simdSize);
retNode = gtNewScalarHWIntrinsicNode(TYP_VOID, intrinsic);
break;
}

Expand Down
3 changes: 3 additions & 0 deletions src/coreclr/jit/instrsarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -1563,6 +1563,9 @@ INST1(uxth, "uxth", 0, IF_DR_2H, 0x53003C00)
INST1(nop, "nop", 0, IF_SN_0A, 0xD503201F)
// nop SN_0A 1101010100000011 0010000000011111 D503 201F

INST1(yield, "yield", 0, IF_SN_0A, 0xD503203F)
// yield SN_0A 1101010100000011 0010000000111111 D503 203F

INST1(bkpt, "bkpt", 0, IF_SN_0A, 0xD43E0000)
// bkpt SN_0A 1101010000111110 0000000000000000 D43E 0000 0xF000

Expand Down
Loading

0 comments on commit fdafc7c

Please sign in to comment.