SVE: Added Load2xVectorAndUnzip, Load3xVectorAndUnzip, Load4xVectorAndUnzip APIs #102180

Merged · 24 commits · May 29, 2024
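For context before the diff: the three new APIs map to the SVE structure loads LD2/LD3/LD4 (see the hwintrinsiclistarm64sve.h entries below), which read interleaved records from memory and de-interleave them into 2, 3, or 4 destination vectors. A minimal scalar model of the semantics, assuming a full predicate (illustrative C++, not code from this PR):

    #include <cstddef>

    // Scalar model of Load2xVectorAndUnzip (SVE LD2): read 2*n interleaved
    // elements and unzip them into structure-of-arrays form, so that
    // dst0[i] = mem[2*i] and dst1[i] = mem[2*i + 1]. LD3/LD4 generalize
    // this to strides of 3 and 4.
    template <typename T>
    void Load2xAndUnzipScalarModel(const T* mem, T* dst0, T* dst1, size_t n)
    {
        for (size_t i = 0; i < n; i++)
        {
            dst0[i] = mem[2 * i];
            dst1[i] = mem[2 * i + 1];
        }
    }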
10 changes: 10 additions & 0 deletions src/coreclr/jit/gentree.cpp
@@ -26594,6 +26594,9 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
case NI_Sve_LoadVectorUInt16ZeroExtendToUInt64:
case NI_Sve_LoadVectorUInt32ZeroExtendToInt64:
case NI_Sve_LoadVectorUInt32ZeroExtendToUInt64:
case NI_Sve_Load2xVectorAndUnzip:
case NI_Sve_Load3xVectorAndUnzip:
case NI_Sve_Load4xVectorAndUnzip:
addr = Op(2);
break;
#endif // TARGET_ARM64
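A hedged note on the operand layout assumed here: these loads are explicit masked operations, so Op(1) is the governing predicate and Op(2) the base address, which is why OperIsMemoryLoad reports Op(2). An illustrative caller's view:

    // Illustrative only: retrieving the address operand of one of these nodes.
    GenTree* addr = nullptr;
    if (node->OperIsMemoryLoad(&addr))
    {
        // For NI_Sve_Load{2,3,4}xVectorAndUnzip, addr is Op(2), the base
        // pointer; Op(1) is the predicate mask.
    }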
@@ -27115,6 +27118,13 @@ ClassLayout* GenTreeHWIntrinsic::GetLayout(Compiler* compiler) const
case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x4:
return compiler->typGetBlkLayout(64);

case NI_Sve_Load2xVectorAndUnzip:
return compiler->typGetBlkLayout(compiler->getVectorTByteLength() * 2);
case NI_Sve_Load3xVectorAndUnzip:
return compiler->typGetBlkLayout(compiler->getVectorTByteLength() * 3);
case NI_Sve_Load4xVectorAndUnzip:
return compiler->typGetBlkLayout(compiler->getVectorTByteLength() * 4);

#endif // TARGET_ARM64

default:
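A quick worked example for the layouts above, assuming a 128-bit SVE implementation where getVectorTByteLength() returns 16:

    // Block sizes reported by GetLayout (sketch; VL = vector byte length):
    //   Load2xVectorAndUnzip: 2 * VL = 32 bytes at VL = 16
    //   Load3xVectorAndUnzip: 3 * VL = 48 bytes at VL = 16
    //   Load4xVectorAndUnzip: 4 * VL = 64 bytes at VL = 16
    // At VL = 16 the 4x case matches the fixed 64-byte layout that
    // NI_AdvSimd_Arm64_LoadAndReplicateToVector128x4 reports above.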
8 changes: 8 additions & 0 deletions src/coreclr/jit/hwintrinsic.cpp
@@ -1603,6 +1603,14 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
// Op1 input is a vector. HWIntrinsic requires a mask.
retNode->AsHWIntrinsic()->Op(1) = gtNewSimdConvertVectorToMaskNode(retType, op1, simdBaseJitType, simdSize);
}

if (HWIntrinsicInfo::IsMultiReg(intrinsic))
{
assert(HWIntrinsicInfo::IsExplicitMaskedOperation(retNode->AsHWIntrinsic()->GetHWIntrinsicId()));
assert(HWIntrinsicInfo::IsMultiReg(retNode->AsHWIntrinsic()->GetHWIntrinsicId()));
retNode =
impStoreMultiRegValueToVar(retNode, sig->retTypeSigClass DEBUGARG(CorInfoCallConvExtension::Managed));
}
}

if (retType != nodeRetType)
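On the new multi-reg block above, a hedged reading of the flow:

    // Sketch (not new API): the intrinsic produces N values in consecutive
    // registers (N = 2, 3, or 4 per the switch in hwintrinsic.h below), and
    // impStoreMultiRegValueToVar spills that multi-reg value to a local,
    // returning a LCL_VAR so later IR can consume each returned vector
    // individually.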
3 changes: 3 additions & 0 deletions src/coreclr/jit/hwintrinsic.h
@@ -831,6 +831,7 @@ struct HWIntrinsicInfo
case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2:
case NI_AdvSimd_LoadAndReplicateToVector64x2:
case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x2:
case NI_Sve_Load2xVectorAndUnzip:
return 2;

@@ -841,6 +842,7 @@
case NI_AdvSimd_LoadVector64x3AndUnzip:
case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
case NI_AdvSimd_LoadAndReplicateToVector64x3:
case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x3:
case NI_Sve_Load3xVectorAndUnzip:
return 3;

@@ -851,6 +853,7 @@
case NI_AdvSimd_LoadVector64x4AndUnzip:
case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
case NI_AdvSimd_LoadAndReplicateToVector64x4:
case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x4:
case NI_Sve_Load4xVectorAndUnzip:
return 4;
#endif

29 changes: 29 additions & 0 deletions src/coreclr/jit/hwintrinsicarm64.cpp
@@ -2296,6 +2296,35 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
retNode = impStoreMultiRegValueToVar(op1, sig->retTypeSigClass DEBUGARG(CorInfoCallConvExtension::Managed));
break;
}

case NI_Sve_Load2xVectorAndUnzip:
case NI_Sve_Load3xVectorAndUnzip:
case NI_Sve_Load4xVectorAndUnzip:
{
info.compNeedsConsecutiveRegisters = true;

assert(sig->numArgs == 2);

op2 = impPopStack().val;
op1 = impPopStack().val;

if (op2->OperIs(GT_CAST))
{
// Although the API specifies a pointer, if what we have is a BYREF, that's what
// we really want, so throw away the cast.
if (op2->gtGetOp1()->TypeGet() == TYP_BYREF)
{
op2 = op2->gtGetOp1();
}
}

assert(HWIntrinsicInfo::IsMultiReg(intrinsic));
assert(HWIntrinsicInfo::IsExplicitMaskedOperation(intrinsic));

retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize);
break;
}
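For reference, the managed signature shape these cases assume (my paraphrase, not text from this PR; mask first, then address), which is why op2 pops as the pointer and op1 as the predicate:

    // Assumed shape (paraphrased, illustrative):
    //   (Vector<T>, Vector<T>) Load2xVectorAndUnzip(Vector<T> mask, T* address)
    // op1 = mask, op2 = address; a cast wrapped around a TYP_BYREF operand is
    // peeled so the managed pointer stays visible to GC tracking.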

case NI_AdvSimd_LoadAndInsertScalarVector64x2:
case NI_AdvSimd_LoadAndInsertScalarVector64x3:
case NI_AdvSimd_LoadAndInsertScalarVector64x4:
20 changes: 20 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -1356,6 +1356,26 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}
break;

case NI_Sve_Load2xVectorAndUnzip:
case NI_Sve_Load3xVectorAndUnzip:
case NI_Sve_Load4xVectorAndUnzip:
{
#ifdef DEBUG
// Validates that consecutive registers were used properly.

assert(node->GetMultiRegCount(compiler) == (unsigned int)GetEmitter()->insGetSveReg1ListSize(ins));

regNumber argReg = targetReg;
for (unsigned int i = 0; i < node->GetMultiRegCount(compiler); i++)
{
assert(argReg == node->GetRegNumByIdx(i));
argReg = getNextSIMDRegWithWraparound(argReg);
}
#endif // DEBUG
GetEmitter()->emitIns_R_R_R_I(ins, emitSize, targetReg, op1Reg, op2Reg, 0, opt);
break;
}
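A hedged sketch of the resulting encoding (assembly form per the Arm SVE manual, not output captured from this PR):

    // For Load2xVectorAndUnzip over 32-bit elements, the emitIns_R_R_R_I call
    // above encodes an instruction of the form:
    //   ld2w { z0.s, z1.s }, p0/z, [x0, #0, mul vl]
    // targetReg names the first register of the destination list; the rest
    // must follow consecutively, wrapping from Z31 back to Z0 (hence
    // getNextSIMDRegWithWraparound), which the DEBUG loop checks against the
    // registers LSRA actually assigned.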

case NI_Sve_StoreAndZipx2:
case NI_Sve_StoreAndZipx3:
case NI_Sve_StoreAndZipx4:
3 changes: 3 additions & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
@@ -93,6 +93,9 @@ HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendToUInt32,
HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendToUInt64, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1h, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVectorUInt32ZeroExtendToInt64, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVectorUInt32ZeroExtendToUInt64, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, Load2xVectorAndUnzip, -1, 2, true, {INS_sve_ld2b, INS_sve_ld2b, INS_sve_ld2h, INS_sve_ld2h, INS_sve_ld2w, INS_sve_ld2w, INS_sve_ld2d, INS_sve_ld2d, INS_sve_ld2w, INS_sve_ld2d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_MultiReg|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Sve, Load3xVectorAndUnzip, -1, 2, true, {INS_sve_ld3b, INS_sve_ld3b, INS_sve_ld3h, INS_sve_ld3h, INS_sve_ld3w, INS_sve_ld3w, INS_sve_ld3d, INS_sve_ld3d, INS_sve_ld3w, INS_sve_ld3d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_MultiReg|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Sve, Load4xVectorAndUnzip, -1, 2, true, {INS_sve_ld4b, INS_sve_ld4b, INS_sve_ld4h, INS_sve_ld4h, INS_sve_ld4w, INS_sve_ld4w, INS_sve_ld4d, INS_sve_ld4d, INS_sve_ld4w, INS_sve_ld4d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_MultiReg|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Sve, Max, -1, -1, false, {INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_fmax, INS_sve_fmax}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MaxAcross, -1, -1, false, {INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_fmaxv, INS_sve_fmaxv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MaxNumber, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmaxnm, INS_sve_fmaxnm}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
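Reading the new table rows, with the column convention inferred from the neighboring entries:

    // One instruction per SIMD base type, in the table's fixed order:
    //   byte/sbyte -> ld2b, short/ushort -> ld2h, int/uint/float -> ld2w,
    //   long/ulong/double -> ld2d (likewise ld3*/ld4* for the 3x/4x rows).
    // The flags mark the rows as scalable, special import/codegen, multi-reg,
    // explicitly (low-)masked, needing consecutive registers, and taking the
    // base type from the first argument.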
13 changes: 13 additions & 0 deletions src/coreclr/jit/lsraarm64.cpp
@@ -1780,6 +1780,19 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount)
break;
}

case NI_Sve_Load2xVectorAndUnzip:
case NI_Sve_Load3xVectorAndUnzip:
case NI_Sve_Load4xVectorAndUnzip:
{
assert(intrin.op1 != nullptr);
assert(intrin.op2 != nullptr);
assert(intrinsicTree->OperIsMemoryLoadOrStore());
srcCount += BuildAddrUses(intrin.op2);
BuildConsecutiveRegistersForDef(intrinsicTree, dstCount);
*pDstCount = dstCount;
break;
}
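One note on the allocation here (my reading of the hunk): the address operand gets ordinary register uses via BuildAddrUses, while the definition reserves one register per returned vector, back to back:

    // Expected definition counts (sketch, per the HWIntrinsicInfo table):
    //   NI_Sve_Load2xVectorAndUnzip -> 2 consecutive SIMD registers
    //   NI_Sve_Load3xVectorAndUnzip -> 3 consecutive SIMD registers
    //   NI_Sve_Load4xVectorAndUnzip -> 4 consecutive SIMD registers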

case NI_Sve_StoreAndZipx2:
case NI_Sve_StoreAndZipx3:
case NI_Sve_StoreAndZipx4: