Skip to content

Commit

Permalink
JIT ARM64-SVE: Add AddAcross
Browse files Browse the repository at this point in the history
  • Loading branch information
a74nh committed Apr 29, 2024
1 parent 7745b5e commit a3d0161
Show file tree
Hide file tree
Showing 11 changed files with 731 additions and 110 deletions.
16 changes: 8 additions & 8 deletions src/coreclr/jit/codegenarm64test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5314,11 +5314,11 @@ void CodeGen::genArm64EmitterUnitTestsSve()
#endif // ALL_ARM64_EMITTER_UNIT_TESTS_SVE_UNSUPPORTED

// IF_SVE_AI_3A
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_1BYTE, REG_V1, REG_P4, REG_V2,
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_SCALABLE, REG_V1, REG_P4, REG_V2,
INS_OPTS_SCALABLE_B); // SADDV <Dd>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_2BYTE, REG_V2, REG_P5, REG_V3,
theEmitter->emitIns_R_R_R(INS_sve_saddv, EA_SCALABLE, REG_V2, REG_P5, REG_V3,
INS_OPTS_SCALABLE_H); // SADDV <Dd>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_uaddv, EA_4BYTE, REG_V3, REG_P6, REG_V4,
theEmitter->emitIns_R_R_R(INS_sve_uaddv, EA_SCALABLE, REG_V3, REG_P6, REG_V4,
INS_OPTS_SCALABLE_S); // UADDV <Dd>, <Pg>, <Zn>.<T>

// IF_SVE_AJ_3A
Expand Down Expand Up @@ -6768,15 +6768,15 @@ void CodeGen::genArm64EmitterUnitTestsSve()
#endif // ALL_ARM64_EMITTER_UNIT_TESTS_SVE_UNSUPPORTED

// IF_SVE_HE_3A
theEmitter->emitIns_R_R_R(INS_sve_faddv, EA_2BYTE, REG_V21, REG_P7, REG_V7,
theEmitter->emitIns_R_R_R(INS_sve_faddv, EA_SCALABLE, REG_V21, REG_P7, REG_V7,
INS_OPTS_SCALABLE_H); // FADDV <V><d>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_fmaxnmv, EA_2BYTE, REG_V22, REG_P6, REG_V6,
theEmitter->emitIns_R_R_R(INS_sve_fmaxnmv, EA_SCALABLE, REG_V22, REG_P6, REG_V6,
INS_OPTS_SCALABLE_H); // FMAXNMV <V><d>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_fmaxv, EA_4BYTE, REG_V23, REG_P5, REG_V5,
theEmitter->emitIns_R_R_R(INS_sve_fmaxv, EA_SCALABLE, REG_V23, REG_P5, REG_V5,
INS_OPTS_SCALABLE_S); // FMAXV <V><d>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_fminnmv, EA_8BYTE, REG_V24, REG_P4, REG_V4,
theEmitter->emitIns_R_R_R(INS_sve_fminnmv, EA_SCALABLE, REG_V24, REG_P4, REG_V4,
INS_OPTS_SCALABLE_D); // FMINNMV <V><d>, <Pg>, <Zn>.<T>
theEmitter->emitIns_R_R_R(INS_sve_fminv, EA_4BYTE, REG_V25, REG_P3, REG_V3,
theEmitter->emitIns_R_R_R(INS_sve_fminv, EA_SCALABLE, REG_V25, REG_P3, REG_V3,
INS_OPTS_SCALABLE_S); // FMINV <V><d>, <Pg>, <Zn>.<T>

// IF_SVE_HQ_3A
Expand Down
29 changes: 23 additions & 6 deletions src/coreclr/jit/emitarm64sve.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3060,7 +3060,6 @@ void emitter::emitInsSve_R_R_R(instruction ins,
break;

case INS_sve_saddv:
case INS_sve_uaddv:
assert(isFloatReg(reg1));
assert(isLowPredicateRegister(reg2));
assert(isVectorRegister(reg3));
Expand All @@ -3069,6 +3068,15 @@ void emitter::emitInsSve_R_R_R(instruction ins,
fmt = IF_SVE_AI_3A;
break;

case INS_sve_uaddv:
assert(isFloatReg(reg1));
assert(isLowPredicateRegister(reg2));
assert(isVectorRegister(reg3));
assert(insOptsScalableStandard(opt));
assert(insScalableOptsNone(sopt));
fmt = IF_SVE_AI_3A;
break;

case INS_sve_addqv:
unreached(); // TODO-SVE: Not yet supported.
assert(isVectorRegister(reg1));
Expand Down Expand Up @@ -4059,7 +4067,7 @@ void emitter::emitInsSve_R_R_R(instruction ins,
assert(isLowPredicateRegister(reg2));
assert(isVectorRegister(reg3));
assert(insOptsScalableFloat(opt));
assert(isValidVectorElemsizeSveFloat(size));
assert(isScalableVectorSize(size));
assert(insScalableOptsNone(sopt));
fmt = IF_SVE_HE_3A;
break;
Expand All @@ -4069,7 +4077,7 @@ void emitter::emitInsSve_R_R_R(instruction ins,
assert(isLowPredicateRegister(reg2));
assert(isVectorRegister(reg3));
assert(insOptsScalableFloat(opt));
assert(isValidVectorElemsizeSveFloat(size));
assert(isScalableVectorSize(size));
assert(insScalableOptsNone(sopt));
fmt = IF_SVE_HJ_3A;
break;
Expand Down Expand Up @@ -12618,7 +12626,7 @@ void emitter::emitInsSveSanityCheck(instrDesc* id)
assert(isVectorRegister(id->idReg1())); // ddddd
assert(isLowPredicateRegister(id->idReg2())); // ggg
assert(isVectorRegister(id->idReg3())); // mmmmm
assert(isValidVectorElemsizeSveFloat(id->idOpSize()));
assert(isScalableVectorSize(id->idOpSize()));
break;

// Scalable to general register.
Expand Down Expand Up @@ -13211,11 +13219,20 @@ void emitter::emitInsSveSanityCheck(instrDesc* id)

// Scalable, widening to scalar SIMD.
case IF_SVE_AI_3A: // ........xx...... ...gggnnnnnddddd -- SVE integer add reduction (predicated)
assert(insOptsScalableWide(id->idInsOpt())); // xx
switch (id->idIns())
{
case INS_sve_saddv:
assert(insOptsScalableWide(id->idInsOpt())); // xx
break;

default:
assert(insOptsScalableStandard(id->idInsOpt())); // xx
break;
}
assert(isVectorRegister(id->idReg1())); // ddddd
assert(isLowPredicateRegister(id->idReg2())); // ggg
assert(isVectorRegister(id->idReg3())); // mmmmm
assert(isValidVectorElemsizeWidening(id->idOpSize()));
assert(isScalableVectorSize(id->idOpSize()));
break;

// Scalable, possibly FP.
Expand Down
145 changes: 73 additions & 72 deletions src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,175 +70,176 @@ enum HWIntrinsicCategory : uint8_t
#else
#error Unsupported platform
#endif

enum HWIntrinsicFlag : unsigned int
{
HW_Flag_NoFlag = 0,

// Commutative
// - if a binary-op intrinsic is commutative (e.g., Add, Multiply), its op1 can be contained
HW_Flag_Commutative = 0x1,
HW_Flag_Commutative = (1 << 0),

// NoCodeGen
// - should be transformed in the compiler front-end, cannot reach CodeGen
HW_Flag_NoCodeGen = 0x2,
HW_Flag_NoCodeGen = (1 << 1),

// Multi-instruction
// - a single intrinsic can generate multiple instructions
HW_Flag_MultiIns = 0x4,
HW_Flag_MultiIns = (1 << 2),

// Select base type using the first argument type
HW_Flag_BaseTypeFromFirstArg = 0x8,
HW_Flag_BaseTypeFromFirstArg = (1 << 3),

// Select base type using the second argument type
HW_Flag_BaseTypeFromSecondArg = 0x10,
HW_Flag_BaseTypeFromSecondArg = (1 << 4),

// Indicates compFloatingPointUsed does not need to be set.
HW_Flag_NoFloatingPointUsed = 0x20,
HW_Flag_NoFloatingPointUsed = (1 << 5),

// NoJmpTable IMM
// the imm intrinsic does not need a jumptable fallback when it gets a non-const argument
HW_Flag_NoJmpTableIMM = 0x40,
HW_Flag_NoJmpTableIMM = (1 << 6),

// Special codegen
// the intrinsics need special rules in CodeGen,
// but may be table-driven in the front-end
HW_Flag_SpecialCodeGen = 0x80,
HW_Flag_SpecialCodeGen = (1 << 7),

// Special import
// the intrinsics need special rules in importer,
// but may be table-driven in the back-end
HW_Flag_SpecialImport = 0x100,
HW_Flag_SpecialImport = (1 << 8),

// The intrinsic returns result in multiple registers.
HW_Flag_MultiReg = 0x200,
HW_Flag_MultiReg = (1 << 9),

// The intrinsic has some barrier special side effect that should be tracked
HW_Flag_SpecialSideEffect_Barrier = (1 << 10),

// The intrinsic has some other special side effect that should be tracked
HW_Flag_SpecialSideEffect_Other = (1 << 11),

HW_Flag_SpecialSideEffectMask = (HW_Flag_SpecialSideEffect_Barrier | HW_Flag_SpecialSideEffect_Other),

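// The flags from here up to HW_Flag_PlatformBase are still common to all platforms;
// the platform-specific flags are defined after HW_Flag_PlatformBase.
// MaybeNoJmpTable IMM
// the imm intrinsic may not need a jumptable fallback when it gets a non-const argument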
HW_Flag_MaybeNoJmpTableIMM = (1 << 12),

HW_Flag_CanBenefitFromConstantProp = (1 << 13),

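// Used as a base for shifting the platform-specific flags: HW_TARGET_FLAG(id) is bit (id + HW_Flag_PlatformBase).
// Platform-specific ids start at 1, because bit HW_Flag_PlatformBase itself is already taken by
// HW_Flag_CanBenefitFromConstantProp.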
HW_Flag_PlatformBase = 13,
#define HW_TARGET_FLAG(id) (unsigned int)(1 << (id + HW_Flag_PlatformBase))

// Platform-specific flags
#if defined(TARGET_XARCH)
// Full range IMM intrinsic
// - the immediate value is valid on the full range of imm8 (0-255)
HW_Flag_FullRangeIMM = 0x400,
HW_Flag_FullRangeIMM = HW_TARGET_FLAG(1),

// Maybe IMM
// the intrinsic has either imm or Vector overloads
HW_Flag_MaybeIMM = 0x800,
HW_Flag_MaybeIMM = HW_TARGET_FLAG(2),

// Copy Upper bits
// some SIMD scalar intrinsics need the semantics of copying upper bits from the source operand
HW_Flag_CopyUpperBits = 0x1000,
HW_Flag_CopyUpperBits = HW_TARGET_FLAG(3),

// Maybe Memory Load/Store
// - some intrinsics may have pointer overloads but without HW_Category_MemoryLoad/HW_Category_MemoryStore
HW_Flag_MaybeMemoryLoad = 0x2000,
HW_Flag_MaybeMemoryStore = 0x4000,
HW_Flag_MaybeMemoryLoad = HW_TARGET_FLAG(4),
HW_Flag_MaybeMemoryStore = HW_TARGET_FLAG(5),

// No Read/Modify/Write Semantics
// the intrinsic doesn't have read/modify/write semantics in two/three-operand form.
HW_Flag_NoRMWSemantics = 0x8000,
HW_Flag_NoRMWSemantics = HW_TARGET_FLAG(6),

// NoContainment
// the intrinsic cannot be handled by containment,
// all intrinsics that have explicit memory load/store semantics should have this flag
HW_Flag_NoContainment = 0x10000,
HW_Flag_NoContainment = HW_TARGET_FLAG(7),

// Returns Per-Element Mask
// the intrinsic returns a vector containing elements that are either "all bits set" or "all bits clear"
// this output can be used as a per-element mask
HW_Flag_ReturnsPerElementMask = 0x20000,
HW_Flag_ReturnsPerElementMask = HW_TARGET_FLAG(8),

// AvxOnlyCompatible
// the intrinsic can be used on hardware with AVX but not AVX2 support
HW_Flag_AvxOnlyCompatible = 0x40000,
HW_Flag_AvxOnlyCompatible = HW_TARGET_FLAG(9),

// MaybeCommutative
// - if a binary-op intrinsic is maybe commutative (e.g., Max or Min for float/double), its op1 can possibly be
// contained
HW_Flag_MaybeCommutative = 0x80000,
HW_Flag_MaybeCommutative = HW_TARGET_FLAG(10),

// The intrinsic has no EVEX compatible form
HW_Flag_NoEvexSemantics = 0x100000,
HW_Flag_NoEvexSemantics = HW_TARGET_FLAG(11),

// The intrinsic is an RMW intrinsic
HW_Flag_RmwIntrinsic = HW_TARGET_FLAG(12),

// The intrinsic is a FusedMultiplyAdd intrinsic
HW_Flag_FmaIntrinsic = HW_TARGET_FLAG(13),

// The intrinsic is a PermuteVar2x intrinsic
HW_Flag_PermuteVar2x = HW_TARGET_FLAG(14),

// The intrinsic is an embedded broadcast compatible intrinsic
HW_Flag_EmbBroadcastCompatible = HW_TARGET_FLAG(15),

// The intrinsic is an embedded rounding compatible intrinsic
HW_Flag_EmbRoundingCompatible = HW_TARGET_FLAG(16),

// The intrinsic is an embedded masking incompatible intrinsic
HW_Flag_EmbMaskingIncompatible = HW_TARGET_FLAG(17),

#elif defined(TARGET_ARM64)
// The intrinsic has an immediate operand
// - the value can be (and should be) encoded in a corresponding instruction when the operand value is constant
HW_Flag_HasImmediateOperand = 0x400,
HW_Flag_HasImmediateOperand = HW_TARGET_FLAG(1),

// The intrinsic has read/modify/write semantics in multiple-operands form.
HW_Flag_HasRMWSemantics = 0x800,
HW_Flag_HasRMWSemantics = HW_TARGET_FLAG(2),

// The intrinsic operates on the lower part of a SIMD register
// - the upper part of the source registers are ignored
// - the upper part of the destination register is zeroed
HW_Flag_SIMDScalar = 0x1000,
HW_Flag_SIMDScalar = HW_TARGET_FLAG(3),

// The intrinsic supports some sort of containment analysis
HW_Flag_SupportsContainment = 0x2000,
HW_Flag_SupportsContainment = HW_TARGET_FLAG(4),

// The intrinsic needs consecutive registers
HW_Flag_NeedsConsecutiveRegisters = 0x4000,
HW_Flag_NeedsConsecutiveRegisters = HW_TARGET_FLAG(5),

// The intrinsic uses scalable registers
HW_Flag_Scalable = 0x8000,
HW_Flag_Scalable = HW_TARGET_FLAG(6),

// Returns Per-Element Mask
// the intrinsic returns a vector containing elements that are either "all bits set" or "all bits clear"
// this output can be used as a per-element mask
HW_Flag_ReturnsPerElementMask = 0x10000,
HW_Flag_ReturnsPerElementMask = HW_TARGET_FLAG(7),

// The intrinsic uses a mask in arg1 to select elements present in the result
HW_Flag_ExplicitMaskedOperation = 0x20000,
HW_Flag_ExplicitMaskedOperation = HW_TARGET_FLAG(8),

// The intrinsic uses a mask in arg1 to select elements present in the result, and must use a low register.
HW_Flag_LowMaskedOperation = 0x40000,
HW_Flag_LowMaskedOperation = HW_TARGET_FLAG(9),

// The intrinsic can optionally use a mask in arg1 to select elements present in the result, which is not present in
// the API call
HW_Flag_OptionalEmbeddedMaskedOperation = 0x80000,
HW_Flag_OptionalEmbeddedMaskedOperation = HW_TARGET_FLAG(10),

// The intrinsic uses a mask in arg1 to select elements present in the result, which is not present in the API call
HW_Flag_EmbeddedMaskedOperation = 0x100000,
HW_Flag_EmbeddedMaskedOperation = HW_TARGET_FLAG(11),

// The intrinsic has an enum operand. Using this implies HW_Flag_HasImmediateOperand.
HW_Flag_HasEnumOperand = HW_TARGET_FLAG(12),

#else
#error Unsupported platform
#endif

// The intrinsic has some barrier special side effect that should be tracked
HW_Flag_SpecialSideEffect_Barrier = 0x200000,

// The intrinsic has some other special side effect that should be tracked
HW_Flag_SpecialSideEffect_Other = 0x400000,

HW_Flag_SpecialSideEffectMask = (HW_Flag_SpecialSideEffect_Barrier | HW_Flag_SpecialSideEffect_Other),

// MaybeNoJmpTable IMM
// the imm intrinsic may not need jumptable fallback when it gets non-const argument
HW_Flag_MaybeNoJmpTableIMM = 0x800000,

#if defined(TARGET_XARCH)
// The intrinsic is an RMW intrinsic
HW_Flag_RmwIntrinsic = 0x1000000,

// The intrinsic is a FusedMultiplyAdd intrinsic
HW_Flag_FmaIntrinsic = 0x2000000,

// The intrinsic is a PermuteVar2x intrinsic
HW_Flag_PermuteVar2x = 0x4000000,

// The intrinsic is an embedded broadcast compatible intrinsic
HW_Flag_EmbBroadcastCompatible = 0x8000000,

// The intrinsic is an embedded rounding compatible intrinsic
HW_Flag_EmbRoundingCompatible = 0x10000000,

// The intrinsic is an embedded masking incompatible intrinsic
HW_Flag_EmbMaskingIncompatible = 0x20000000,
#elif defined(TARGET_ARM64)

// The intrinsic has an enum operand. Using this implies HW_Flag_HasImmediateOperand.
HW_Flag_HasEnumOperand = 0x1000000,

#endif // TARGET_XARCH

HW_Flag_CanBenefitFromConstantProp = 0x80000000,
};

#if defined(TARGET_XARCH)
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
// Sve
HARDWARE_INTRINSIC(Sve, Abs, -1, -1, false, {INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_fabs, INS_sve_fabs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, Add, -1, -1, false, {INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_add, INS_sve_fadd, INS_sve_fadd}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, AddAcross, -1, 1, true, {INS_sve_saddv, INS_sve_uaddv, INS_sve_saddv, INS_sve_uaddv, INS_sve_saddv, INS_sve_uaddv, INS_sve_uaddv, INS_sve_uaddv, INS_sve_faddv, INS_sve_faddv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation)
HARDWARE_INTRINSIC(Sve, ConditionalSelect, -1, 3, true, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_SupportsContainment)
HARDWARE_INTRINSIC(Sve, Count16BitElements, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_cnth, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_HasEnumOperand|HW_Flag_SpecialCodeGen|HW_Flag_NoFloatingPointUsed)
HARDWARE_INTRINSIC(Sve, Count32BitElements, 0, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_cntw, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_HasEnumOperand|HW_Flag_SpecialCodeGen|HW_Flag_NoFloatingPointUsed)
Expand Down
Loading

0 comments on commit a3d0161

Please sign in to comment.