Skip to content

Commit

Permalink
ARM64-SVE: Add GatherVectorWithByteOffsetFirstFaulting (#106199)
Browse files Browse the repository at this point in the history
Co-authored-by: Jakob Botsch Nielsen <Jakob.botsch.nielsen@gmail.com>
  • Loading branch information
amanasifkhalid and jakobbotsch committed Aug 12, 2024
1 parent adb1fee commit 6169e41
Show file tree
Hide file tree
Showing 12 changed files with 1,048 additions and 47 deletions.
1 change: 1 addition & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26690,6 +26690,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
case NI_Sve_GatherVectorUInt16ZeroExtend:
case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt32ZeroExtend:
case NI_Sve_GatherVectorWithByteOffsetFirstFaulting:
case NI_Sve_GatherVectorWithByteOffsets:
case NI_Sve_LoadVector:
case NI_Sve_LoadVectorNonTemporal:
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2185,6 +2185,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
case NI_Sve_GatherVectorUInt16ZeroExtend:
case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt32ZeroExtend:
case NI_Sve_GatherVectorWithByteOffsetFirstFaulting:
case NI_Sve_GatherVectorWithByteOffsets:
assert(varTypeIsSIMD(op3->TypeGet()));
if (numArgs == 3)
Expand Down
8 changes: 5 additions & 3 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2090,14 +2090,16 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
case NI_Sve_GatherVectorUInt16ZeroExtend:
case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt32ZeroExtend:
case NI_Sve_GatherVectorWithByteOffsetFirstFaulting:
{
if (!varTypeIsSIMD(intrin.op2->gtType))
{
// GatherVector...(Vector<T> mask, T* address, Vector<T2> indices)

emitAttr baseSize = emitActualTypeSize(intrin.baseType);
bool isLoadingBytes = ((ins == INS_sve_ld1b) || (ins == INS_sve_ld1sb) || (ins == INS_sve_ldff1b) ||
(ins == INS_sve_ldff1sb));
emitAttr baseSize = emitActualTypeSize(intrin.baseType);
bool isLoadingBytes =
((ins == INS_sve_ld1b) || (ins == INS_sve_ld1sb) || (ins == INS_sve_ldff1b) ||
(ins == INS_sve_ldff1sb) || (intrin.id == NI_Sve_GatherVectorWithByteOffsetFirstFaulting));
insScalableOpts sopt = INS_SCALABLE_OPTS_NONE;

if (baseSize == EA_4BYTE)
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ HARDWARE_INTRINSIC(Sve, GatherVectorUInt16WithByteOffsetsZeroExtend,
HARDWARE_INTRINSIC(Sve, GatherVectorUInt16ZeroExtend, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1h, INS_sve_ld1h, INS_sve_ld1h, INS_sve_ld1h, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorUInt32WithByteOffsetsZeroExtend, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorUInt32ZeroExtend, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GatherVectorWithByteOffsetFirstFaulting, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldff1w, INS_sve_ldff1w, INS_sve_ldff1d, INS_sve_ldff1d, INS_sve_ldff1w, INS_sve_ldff1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_SpecialSideEffect_Other)
HARDWARE_INTRINSIC(Sve, GatherVectorWithByteOffsets, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1d, INS_sve_ld1d, INS_sve_ld1w, INS_sve_ld1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, GetActiveElementCount, -1, 2, {INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp, INS_sve_cntp}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_ExplicitMaskedOperation)
HARDWARE_INTRINSIC(Sve, GetFfrByte, -1, 0, {INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffect_Other)
Expand Down
50 changes: 9 additions & 41 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1782,6 +1782,14 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}
case NI_Sve_GatherVectorFirstFaulting:
case NI_Sve_GatherVectorWithByteOffsetFirstFaulting:
case NI_Sve_LoadVectorByteZeroExtendFirstFaulting:
case NI_Sve_LoadVectorFirstFaulting:
case NI_Sve_LoadVectorInt16SignExtendFirstFaulting:
case NI_Sve_LoadVectorInt32SignExtendFirstFaulting:
case NI_Sve_LoadVectorSByteSignExtendFirstFaulting:
case NI_Sve_LoadVectorUInt16ZeroExtendFirstFaulting:
case NI_Sve_LoadVectorUInt32ZeroExtendFirstFaulting:
{
LIR::Use use;
bool foundUse = BlockRange().TryGetUse(node, &use);
Expand Down Expand Up @@ -1825,47 +1833,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
StoreFFRValue(node);
break;
}
case NI_Sve_LoadVectorByteZeroExtendFirstFaulting:
case NI_Sve_LoadVectorFirstFaulting:
case NI_Sve_LoadVectorInt16SignExtendFirstFaulting:
case NI_Sve_LoadVectorInt32SignExtendFirstFaulting:
case NI_Sve_LoadVectorSByteSignExtendFirstFaulting:
case NI_Sve_LoadVectorUInt16ZeroExtendFirstFaulting:
case NI_Sve_LoadVectorUInt32ZeroExtendFirstFaulting:
{
LIR::Use use;
bool foundUse = BlockRange().TryGetUse(node, &use);

if (m_ffrTrashed)
{
// Consume the FFR register value from local variable to simulate "use" of FFR,
// only if it was trashed. If it was not trashed, we do not have to reload the
// contents of the FFR register.

unsigned lclNum = comp->getFFRegisterVarNum();
GenTree* lclVar = comp->gtNewLclvNode(lclNum, TYP_MASK);
BlockRange().InsertBefore(node, lclVar);
LowerNode(lclVar);

node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), lclVar);
}

if (foundUse)
{
unsigned tmpNum = comp->lvaGrabTemp(true DEBUGARG("Return value result/FFR"));
LclVarDsc* tmpVarDsc = comp->lvaGetDesc(tmpNum);
tmpVarDsc->lvType = node->TypeGet();
GenTree* storeLclVar;
use.ReplaceWithLclVar(comp, tmpNum, &storeLclVar);
}
else
{
node->SetUnusedValue();
}

StoreFFRValue(node);
break;
}
default:
break;
}
Expand Down Expand Up @@ -4146,6 +4113,7 @@ void Lowering::StoreFFRValue(GenTreeHWIntrinsic* node)
switch (node->GetHWIntrinsicId())
{
case NI_Sve_GatherVectorFirstFaulting:
case NI_Sve_GatherVectorWithByteOffsetFirstFaulting:
case NI_Sve_LoadVectorByteZeroExtendFirstFaulting:
case NI_Sve_LoadVectorFirstFaulting:
case NI_Sve_LoadVectorInt16SignExtendFirstFaulting:
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/lsraarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2113,6 +2113,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
case NI_Sve_GatherVectorUInt16ZeroExtend:
case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
case NI_Sve_GatherVectorUInt32ZeroExtend:
case NI_Sve_GatherVectorWithByteOffsetFirstFaulting:
assert(intrinsicTree->OperIsMemoryLoadOrStore());
FALLTHROUGH;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4290,7 +4290,82 @@ internal Arm64() { }
public static unsafe Vector<ulong> GatherVectorUInt32ZeroExtend(Vector<ulong> mask, uint* address, Vector<ulong> indices) { throw new PlatformNotSupportedException(); }


// Unextended load
/// Unextended load, first-faulting

/// <summary>
/// svfloat64_t svldff1_gather_[s64]offset[_f64](svbool_t pg, const float64_t *base, svint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<double> GatherVectorWithByteOffsetFirstFaulting(Vector<double> mask, double* address, Vector<long> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svfloat64_t svldff1_gather_[u64]offset[_f64](svbool_t pg, const float64_t *base, svuint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<double> GatherVectorWithByteOffsetFirstFaulting(Vector<double> mask, double* address, Vector<ulong> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint32_t svldff1_gather_[s32]offset[_s32](svbool_t pg, const int32_t *base, svint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, SXTW]
/// </summary>
public static unsafe Vector<int> GatherVectorWithByteOffsetFirstFaulting(Vector<int> mask, int* address, Vector<int> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint32_t svldff1_gather_[u32]offset[_s32](svbool_t pg, const int32_t *base, svuint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, UXTW]
/// </summary>
public static unsafe Vector<int> GatherVectorWithByteOffsetFirstFaulting(Vector<int> mask, int* address, Vector<uint> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint64_t svldff1_gather_[s64]offset[_s64](svbool_t pg, const int64_t *base, svint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<long> GatherVectorWithByteOffsetFirstFaulting(Vector<long> mask, long* address, Vector<long> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svint64_t svldff1_gather_[u64]offset[_s64](svbool_t pg, const int64_t *base, svuint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<long> GatherVectorWithByteOffsetFirstFaulting(Vector<long> mask, long* address, Vector<ulong> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svfloat32_t svldff1_gather_[s32]offset[_f32](svbool_t pg, const float32_t *base, svint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, SXTW]
/// </summary>
public static unsafe Vector<float> GatherVectorWithByteOffsetFirstFaulting(Vector<float> mask, float* address, Vector<int> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svfloat32_t svldff1_gather_[u32]offset[_f32](svbool_t pg, const float32_t *base, svuint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, UXTW]
/// </summary>
public static unsafe Vector<float> GatherVectorWithByteOffsetFirstFaulting(Vector<float> mask, float* address, Vector<uint> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint32_t svldff1_gather_[s32]offset[_u32](svbool_t pg, const uint32_t *base, svint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, SXTW]
/// </summary>
public static unsafe Vector<uint> GatherVectorWithByteOffsetFirstFaulting(Vector<uint> mask, uint* address, Vector<int> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint32_t svldff1_gather_[u32]offset[_u32](svbool_t pg, const uint32_t *base, svuint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, UXTW]
/// </summary>
public static unsafe Vector<uint> GatherVectorWithByteOffsetFirstFaulting(Vector<uint> mask, uint* address, Vector<uint> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint64_t svldff1_gather_[s64]offset[_u64](svbool_t pg, const uint64_t *base, svint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<ulong> GatherVectorWithByteOffsetFirstFaulting(Vector<ulong> mask, ulong* address, Vector<long> offsets) { throw new PlatformNotSupportedException(); }

/// <summary>
/// svuint64_t svldff1_gather_[u64]offset[_u64](svbool_t pg, const uint64_t *base, svuint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<ulong> GatherVectorWithByteOffsetFirstFaulting(Vector<ulong> mask, ulong* address, Vector<ulong> offsets) { throw new PlatformNotSupportedException(); }


/// Unextended load

/// <summary>
/// svfloat64_t svld1_gather_[s64]offset[_f64](svbool_t pg, const float64_t *base, svint64_t offsets)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4287,7 +4287,82 @@ internal Arm64() { }
public static unsafe Vector<ulong> GatherVectorUInt32ZeroExtend(Vector<ulong> mask, uint* address, Vector<ulong> indices) => GatherVectorUInt32ZeroExtend(mask, address, indices);


// Unextended load
/// Unextended load, first-faulting

/// <summary>
/// svfloat64_t svldff1_gather_[s64]offset[_f64](svbool_t pg, const float64_t *base, svint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<double> GatherVectorWithByteOffsetFirstFaulting(Vector<double> mask, double* address, Vector<long> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svfloat64_t svldff1_gather_[u64]offset[_f64](svbool_t pg, const float64_t *base, svuint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<double> GatherVectorWithByteOffsetFirstFaulting(Vector<double> mask, double* address, Vector<ulong> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svint32_t svldff1_gather_[s32]offset[_s32](svbool_t pg, const int32_t *base, svint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, SXTW]
/// </summary>
public static unsafe Vector<int> GatherVectorWithByteOffsetFirstFaulting(Vector<int> mask, int* address, Vector<int> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svint32_t svldff1_gather_[u32]offset[_s32](svbool_t pg, const int32_t *base, svuint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, UXTW]
/// </summary>
public static unsafe Vector<int> GatherVectorWithByteOffsetFirstFaulting(Vector<int> mask, int* address, Vector<uint> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svint64_t svldff1_gather_[s64]offset[_s64](svbool_t pg, const int64_t *base, svint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<long> GatherVectorWithByteOffsetFirstFaulting(Vector<long> mask, long* address, Vector<long> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svint64_t svldff1_gather_[u64]offset[_s64](svbool_t pg, const int64_t *base, svuint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<long> GatherVectorWithByteOffsetFirstFaulting(Vector<long> mask, long* address, Vector<ulong> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svfloat32_t svldff1_gather_[s32]offset[_f32](svbool_t pg, const float32_t *base, svint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, SXTW]
/// </summary>
public static unsafe Vector<float> GatherVectorWithByteOffsetFirstFaulting(Vector<float> mask, float* address, Vector<int> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svfloat32_t svldff1_gather_[u32]offset[_f32](svbool_t pg, const float32_t *base, svuint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, UXTW]
/// </summary>
public static unsafe Vector<float> GatherVectorWithByteOffsetFirstFaulting(Vector<float> mask, float* address, Vector<uint> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svuint32_t svldff1_gather_[s32]offset[_u32](svbool_t pg, const uint32_t *base, svint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, SXTW]
/// </summary>
public static unsafe Vector<uint> GatherVectorWithByteOffsetFirstFaulting(Vector<uint> mask, uint* address, Vector<int> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svuint32_t svldff1_gather_[u32]offset[_u32](svbool_t pg, const uint32_t *base, svuint32_t offsets)
/// LDFF1W Zresult.S, Pg/Z, [Xbase, Zoffsets.S, UXTW]
/// </summary>
public static unsafe Vector<uint> GatherVectorWithByteOffsetFirstFaulting(Vector<uint> mask, uint* address, Vector<uint> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svuint64_t svldff1_gather_[s64]offset[_u64](svbool_t pg, const uint64_t *base, svint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<ulong> GatherVectorWithByteOffsetFirstFaulting(Vector<ulong> mask, ulong* address, Vector<long> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);

/// <summary>
/// svuint64_t svldff1_gather_[u64]offset[_u64](svbool_t pg, const uint64_t *base, svuint64_t offsets)
/// LDFF1D Zresult.D, Pg/Z, [Xbase, Zoffsets.D]
/// </summary>
public static unsafe Vector<ulong> GatherVectorWithByteOffsetFirstFaulting(Vector<ulong> mask, ulong* address, Vector<ulong> offsets) => GatherVectorWithByteOffsetFirstFaulting(mask, address, offsets);


/// Unextended load

/// <summary>
/// svfloat64_t svld1_gather_[s64]offset[_f64](svbool_t pg, const float64_t *base, svint64_t offsets)
Expand Down
Loading

0 comments on commit 6169e41

Please sign in to comment.