
Arm64/Sve: Implement SVE Math *Multiply* APIs #102007

Merged
merged 30 commits into from
May 11, 2024
Changes from 6 commits
Commits
30 commits
97373ca
Add *Fused* APIs
kunalspathak May 6, 2024
4e14098
fix an assert in morph
kunalspathak May 7, 2024
3fb9dea
Map APIs to instructions
kunalspathak May 7, 2024
600391a
Add test cases
kunalspathak May 8, 2024
67e4d4d
handle fused* instructions
kunalspathak May 8, 2024
54899b2
jit format
kunalspathak May 8, 2024
e4a53ae
Added MultiplyAdd/MultiplySubtract
kunalspathak May 8, 2024
bfad7b7
Add mapping of API to instruction
kunalspathak May 8, 2024
100f289
Add test cases
kunalspathak May 8, 2024
8ac1840
Handle mov Z, Z instruction
kunalspathak May 9, 2024
9eb195e
Reuse GetResultOpNumForRmwIntrinsic() for arm64
kunalspathak May 9, 2024
c182d0d
Reuse HW_Flag_FmaIntrinsic for arm64
kunalspathak May 9, 2024
62ea159
Mark FMA APIs as HW_Flag_FmaIntrinsic
kunalspathak May 9, 2024
28a49cb
Handle FMA in LSRA and codegen
kunalspathak May 9, 2024
722dd55
Remove the SpecialCodeGen flag from selectedScalar
kunalspathak May 9, 2024
229f78f
address some more scenarios
kunalspathak May 10, 2024
a21439f
jit format
kunalspathak May 10, 2024
6a01ca4
Add MultiplyBySelectedScalar
kunalspathak May 10, 2024
318cbf3
Map the API to the instruction
kunalspathak May 10, 2024
e3fc830
fix a bug where *Indexed API used with ConditionalSelect were failing
kunalspathak May 10, 2024
1ca5539
unpredicated movprfx should not send opt
kunalspathak May 10, 2024
eb41e1d
Add the missing flags for Subtract/Multiply
kunalspathak May 10, 2024
7874f25
Added tests for MultiplyBySelectedScalar
kunalspathak May 10, 2024
f756afb
fixes to test cases
kunalspathak May 10, 2024
2904934
fix the parameter for selectedScalar test
kunalspathak May 10, 2024
53d29a0
Merge remote-tracking branch 'origin/main' into sve_math6
kunalspathak May 10, 2024
98ac0ce
jit format
kunalspathak May 10, 2024
0f89e10
Contain(op3) of CndSel if op1 is AllTrueMask
kunalspathak May 10, 2024
8e928ec
Handle FMA properly
kunalspathak May 10, 2024
c713d31
added assert
kunalspathak May 10, 2024
2 changes: 2 additions & 0 deletions src/coreclr/jit/hwintrinsicarm64.cpp
@@ -277,6 +277,8 @@ void HWIntrinsicInfo::lookupImmBounds(
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x4:
case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128:
case NI_AdvSimd_Arm64_InsertSelectedScalar:
case NI_Sve_FusedMultiplyAddBySelectedScalar:
case NI_Sve_FusedMultiplySubtractBySelectedScalar:
immUpperBound = Compiler::getSIMDVectorLength(simdSize, baseType) - 1;
break;

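For context, a hedged sketch of what this bound means at the call site. The shape below follows the FusedMultiplyAddBySelectedScalar API added in this PR, but the parameter names and index type are assumptions rather than taken from the PR:

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static Vector<float> FmaByScalar(Vector<float> addend, Vector<float> left, Vector<float> right)
{
    // The last argument selects one element of `right`. It must be a JIT-time constant no
    // larger than getSIMDVectorLength(simdSize, baseType) - 1 (the immUpperBound computed
    // above), i.e. 0..3 for float and 0..1 for double with a 16-byte simdSize.
    return Sve.FusedMultiplyAddBySelectedScalar(addend, left, right, 1);
}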
59 changes: 58 additions & 1 deletion src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -417,10 +417,16 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
regNumber maskReg = op1Reg;
regNumber embMaskOp1Reg = REG_NA;
regNumber embMaskOp2Reg = REG_NA;
regNumber embMaskOp3Reg = REG_NA;
regNumber falseReg = op3Reg;

switch (intrinEmbMask.numOperands)
{
case 3:
assert(intrinEmbMask.op3 != nullptr);
embMaskOp3Reg = intrinEmbMask.op3->GetRegNum();
FALLTHROUGH;

case 2:
assert(intrinEmbMask.op2 != nullptr);
embMaskOp2Reg = intrinEmbMask.op2->GetRegNum();
@@ -438,6 +444,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
switch (intrinEmbMask.numOperands)
{
case 1:
{
assert(!instrIsRMW);

if (targetReg != falseReg)
@@ -488,9 +495,10 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)

GetEmitter()->emitIns_R_R_R(insEmbMask, emitSize, targetReg, maskReg, embMaskOp1Reg, opt);
break;
}

case 2:

{
assert(instrIsRMW);

if (intrin.op3->IsVectorZero())
@@ -560,7 +568,50 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}

break;
}
case 3:
{
assert(instrIsRMW);
assert(targetReg != falseReg);
assert(targetReg != embMaskOp2Reg);
assert(targetReg != embMaskOp3Reg);
assert(!HWIntrinsicInfo::IsOptionalEmbeddedMaskedOperation(intrinEmbMask.id));

if (intrin.op3->IsVectorZero())
Member:

Should this be asserting that intrin.op3 is contained?

Member Author:

added `

{
// If `falseReg` is zero, then move the first operand of `intrinEmbMask` in the
// destination using /Z.

assert(targetReg != embMaskOp2Reg);
GetEmitter()->emitIns_R_R_R(INS_sve_movprfx, emitSize, targetReg, maskReg, embMaskOp1Reg, opt);

// Finally, perform the actual "predicated" operation so that `targetReg` is the first operand
// `embMaskOp2Reg` is the second operand and `embMaskOp3Reg` is the third operand.
GetEmitter()->emitIns_R_R_R_R(insEmbMask, emitSize, targetReg, maskReg, embMaskOp2Reg,
embMaskOp3Reg, opt);
}
else
{
// If the instruction just has "predicated" version, then move the "embMaskOp1Reg"
// into targetReg. Next, do the predicated operation on the targetReg and last,
// use "sel" to select the active lanes based on mask, and set inactive lanes
// to falseReg.

assert(HWIntrinsicInfo::IsEmbeddedMaskedOperation(intrinEmbMask.id));

if (targetReg != embMaskOp1Reg)
{
GetEmitter()->emitIns_R_R(INS_sve_movprfx, EA_SCALABLE, targetReg, embMaskOp1Reg);
}

GetEmitter()->emitIns_R_R_R_R(insEmbMask, emitSize, targetReg, maskReg, embMaskOp2Reg,
embMaskOp3Reg, opt);

GetEmitter()->emitIns_R_R_R_R(INS_sve_sel, emitSize, targetReg, maskReg, targetReg, falseReg,
opt, INS_SCALABLE_OPTS_UNPREDICATED);

Member:
Is there an assumption being made about the instruction being RMW here?

FMLA encodes 4 registers (Zda, Pg, Zn, and Zm) where Zda is both the source and destination and the operation is functionally similar to Zda += (Zn * Zm) (with only a single rounding operation).

Given some Zda = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), Zda) it can then be encoded as simply:

fmla Zda, Pg/M, Zn, Zm

Given some Zda = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), merge) it can then be encoded as simply:

movprfx Zda, Pg/M, merge
fmla Zda, Pg/M, Zn, Zm

Given some Zda = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), Zero) it can then be encoded as simply:

movprfx Zda, Pg/Z, Zda
fmla Zda, Pg/M, Zn, Zm

There are then similar versions possible using fmad when the multiplier is the source and destination (op2Reg == tgtReg or op3Reg == tgtReg).


We should actually never need sel for this case, but only need complex generation if tgtReg is unique from all input registers (including the merge) and we're merging with a non-zero value, such as dest = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), merge):

mov dest, Zda
movprfx dest, Pg/M, merge
fmla dest, Pg/M, Zn, Zm

This ends up being different from the other fallbacks that do use sel, specifically because it's RMW and requires predication (that is, there is no unpredicated fmla).
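At the managed level, the three cases above differ only in the false value passed to ConditionalSelect. A hedged C# sketch of those patterns (API names follow this PR; the exact overloads are assumed):

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static void MergeCases(Vector<float> pg, Vector<float> zda, Vector<float> zn, Vector<float> zm, Vector<float> merge)
{
    // False value is the addend itself: expected to fold to a single predicated fmla.
    Vector<float> r1 = Sve.ConditionalSelect(pg, Sve.FusedMultiplyAdd(zda, zn, zm), zda);

    // False value is zero: expected to become a zeroing movprfx followed by fmla.
    Vector<float> r2 = Sve.ConditionalSelect(pg, Sve.FusedMultiplyAdd(zda, zn, zm), Vector<float>.Zero);

    // False value is an unrelated vector: the longer merging sequence discussed in this thread.
    Vector<float> r3 = Sve.ConditionalSelect(pg, Sve.FusedMultiplyAdd(zda, zn, zm), merge);
}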

Member:

The main reason for using ins (unpredicated); sel in the other case is that it allows a 2-instruction sequence as the worst case.

In this case, we need at worst a 3-instruction sequence due to the required predication on the instruction. Thus, it becomes better to use mov; movprfx (predicated); ins (predicated) instead, as that allows the mov to be elided by the register renamer.

Member Author:

such as dest = ConditionalSelect(Pg, FusedMultiplyAdd(Zda, Zn, Zm), merge):

For reasoning similar to that mentioned in #100743 (comment) (where we should only movprfx the inactive lanes from merge -> dest), the code should be:

mov dest, Zda
fmla dest, Pg/M, Zn, Zm
sel dest, Pg/M, dest, merge

Member Author:

Actually, I misinterpreted the value of Pg/M as AllTrue. Spoke to @tannergooding offline and we would like to generate:

sel dest, Pg/M, Zda, merge
fmla dest, Pg/M, Zn, Zm

}
break;
}
default:
unreached();
}
@@ -627,6 +678,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
INS_SCALABLE_OPTS_UNPREDICATED);
}
break;
case 4:
assert(!isRMW);
GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, opt,
INS_SCALABLE_OPTS_UNPREDICATED);
break;

default:
unreached();
}
6 changes: 6 additions & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
@@ -46,6 +46,12 @@ HARDWARE_INTRINSIC(Sve, CreateWhileLessThanOrEqualMask32Bit,
HARDWARE_INTRINSIC(Sve, CreateWhileLessThanOrEqualMask64Bit, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_whilele, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(Sve, CreateWhileLessThanOrEqualMask8Bit, -1, 2, false, {INS_invalid, INS_sve_whilele, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask)
HARDWARE_INTRINSIC(Sve, Divide, -1, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sdiv, INS_sve_udiv, INS_sve_sdiv, INS_sve_udiv, INS_sve_fdiv, INS_sve_fdiv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, FusedMultiplyAdd, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmla, INS_sve_fmla}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
Contributor:

You are always using FMLA for these. Will there be cases where FMAD might be more optimal based on register usage? If so, raise an issue to track it.

Member Author:

Currently, I am just preferencing op1 as a targetPrefUse; in other words, telling LSRA to use op1 as the targetReg and to mark the registers of the other operands as delayFree. With that, using FMLA will always be optimal. @tannergooding - please correct me if I missed anything here.

Contributor:

Ok, that sounds reasonable.
There might be scenarios where FMAD is still optimal - those where op2 is never reused in the C#, but op1 is reused. Using FMAD would avoid having to mov op1 into a temp.

Member:

I would definitely expect us to have some logic around picking FMLA vs FMAD.

The x64 logic is even more complex because it has to handle the RMW consideration (should the tgtPrefUse be the addend or the multiplier), but it also needs to consider which memory operand should be contained (since it supports embedded loads). That logic is here: https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/lowerxarch.cpp#L9823 and you'll note that it uses node->GetResultOpNumForRmwIntrinsic to determine which of op1, op2, or op3 is both an input and an output, or otherwise which is last use. It uses this to ensure the right containment choices are being made.

x64 then repeats this logic in LSRA to actually set the tgtPrefUse: https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/lsraxarch.cpp#L2432 and then again in codegen to pick which instruction form it should use: https://github.com/dotnet/runtime/blob/main/src/coreclr/jit/hwintrinsiccodegenxarch.cpp#L2947

I expect that Arm64 just needs to mirror the LSRA and codegen logic (ignoring any bits relevant to containment), picking FMLA vs FMAD (rather than 231 vs 213, respectively).
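As a hedged illustration of when each form would win (illustrative C# only; the actual choice belongs in LSRA/codegen as described above): FMLA writes its result over the addend register, FMAD over a multiplier register, so the better pick depends on which input is last-use.

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

// The addend 'acc' is the value that lives on, so preferencing its register as the
// destination (the FMLA form) needs no extra mov.
static Vector<float> Accumulate(Vector<float> pg, Vector<float> x, Vector<float> y, int steps)
{
    Vector<float> acc = Vector<float>.Zero;
    for (int i = 0; i < steps; i++)
    {
        acc = Sve.ConditionalSelect(pg, Sve.FusedMultiplyAdd(acc, x, y), acc);
    }
    return acc;
}

// Here the addend 'bias' is reused after the FMA while 'x' is last-use, so a form that
// overwrites a multiplier register (FMAD) would avoid copying 'bias' to a fresh register.
static Vector<float> BiasedScale(Vector<float> pg, Vector<float> bias, Vector<float> x, Vector<float> y)
{
    Vector<float> t = Sve.ConditionalSelect(pg, Sve.FusedMultiplyAdd(bias, x, y), bias);
    return t + bias;
}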

HARDWARE_INTRINSIC(Sve, FusedMultiplyAddBySelectedScalar, -1, 4, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmla, INS_sve_fmla}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, FusedMultiplyAddNegated, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fnmla, INS_sve_fnmla}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, FusedMultiplySubtract, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmls, INS_sve_fmls}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, FusedMultiplySubtractBySelectedScalar, -1, 4, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmls, INS_sve_fmls}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, FusedMultiplySubtractNegated, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fnmls, INS_sve_fnmls}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVector, -1, 2, true, {INS_sve_ld1b, INS_sve_ld1b, INS_sve_ld1h, INS_sve_ld1h, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1d, INS_sve_ld1d, INS_sve_ld1w, INS_sve_ld1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVectorByteZeroExtendToInt16, -1, 2, false, {INS_invalid, INS_invalid, INS_sve_ld1b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVectorByteZeroExtendToInt32, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1b, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
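A hedged usage sketch tying the new table entries to the instructions they map to; wrapping each call in ConditionalSelect to supply the governing predicate mirrors the embedded-mask pattern used elsewhere in this PR, and the overload shapes are assumptions:

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static void MultiplyFamily(Vector<double> pg, Vector<double> acc, Vector<double> x, Vector<double> y)
{
    Vector<double> a1 = Sve.ConditionalSelect(pg, Sve.FusedMultiplyAdd(acc, x, y), acc);             // fmla
    Vector<double> a2 = Sve.ConditionalSelect(pg, Sve.FusedMultiplySubtract(acc, x, y), acc);        // fmls
    Vector<double> a3 = Sve.ConditionalSelect(pg, Sve.FusedMultiplyAddNegated(acc, x, y), acc);      // fnmla
    Vector<double> a4 = Sve.ConditionalSelect(pg, Sve.FusedMultiplySubtractNegated(acc, x, y), acc); // fnmls
    Vector<double> a5 = Sve.FusedMultiplyAddBySelectedScalar(acc, x, y, 0);                          // fmla (indexed)
    Vector<double> a6 = Sve.FusedMultiplySubtractBySelectedScalar(acc, x, y, 0);                     // fmls (indexed)
}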
10 changes: 10 additions & 0 deletions src/coreclr/jit/lowerarmarch.cpp
@@ -3352,6 +3352,16 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_Sve_FusedMultiplyAddBySelectedScalar:
case NI_Sve_FusedMultiplySubtractBySelectedScalar:
assert(hasImmediateOperand);
assert(varTypeIsIntegral(intrin.op4));
if (intrin.op4->IsCnsIntOrI())
{
MakeSrcContained(node, intrin.op4);
}
break;

default:
unreached();
}
24 changes: 21 additions & 3 deletions src/coreclr/jit/lsraarm64.cpp
@@ -1772,7 +1772,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
// then record delay-free for operands as well as the "merge" value
GenTreeHWIntrinsic* intrinEmbOp2 = intrin.op2->AsHWIntrinsic();
size_t numArgs = intrinEmbOp2->GetOperandCount();
assert((numArgs == 1) || (numArgs == 2));
assert((numArgs == 1) || (numArgs == 2) || (numArgs == 3));
tgtPrefUse = BuildUse(intrinEmbOp2->Op(1));
srcCount += 1;

@@ -1792,7 +1792,8 @@

assert(intrin.op1 != nullptr);

bool forceOp2DelayFree = false;
bool forceOp2DelayFree = false;
regMaskTP candidates = RBM_NONE;
if ((intrin.id == NI_Vector64_GetElement) || (intrin.id == NI_Vector128_GetElement))
{
if (!intrin.op2->IsCnsIntOrI() && (!intrin.op1->isContained() || intrin.op1->OperIsLocal()))
@@ -1815,6 +1816,22 @@
}
}

if ((intrin.id == NI_Sve_FusedMultiplyAddBySelectedScalar) ||
Contributor:

Why do these require special code here?

Member Author (kunalspathak, May 8, 2024):

Because, per the FMLA (indexed) encoding, Zm has to be in the lower vector registers.

[screenshot of the FMLA (indexed) encoding from the Arm reference, showing the restricted Zm register field]

We have similar code for AdvSimd too; most likely, if I see more patterns in the future, I will combine this code with it.

if ((intrin.category == HW_Category_SIMDByIndexedElement) && (genTypeSize(intrin.baseType) == 2))
{
// Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g.
// "MLA (by element)") have encoding that restricts what registers that can be used for the indexed element when
// the element size is H (i.e. 2 bytes).
assert(intrin.op2 != nullptr);
if ((intrin.op4 != nullptr) || ((intrin.op3 != nullptr) && !hasImmediateOperand))
{
if (isRMW)
{
srcCount += BuildDelayFreeUses(intrin.op2, nullptr);
srcCount += BuildDelayFreeUses(intrin.op3, nullptr, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS);
}
else
{
srcCount += BuildOperandUses(intrin.op2);
srcCount += BuildOperandUses(intrin.op3, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS);
}
if (intrin.op4 != nullptr)
{
assert(hasImmediateOperand);
assert(varTypeIsIntegral(intrin.op4));
srcCount += BuildOperandUses(intrin.op4);
}
}

(intrin.id == NI_Sve_FusedMultiplySubtractBySelectedScalar))
{
// If this is common pattern, then we will add a flag in the table, but for now, just check for specific
// intrinsics
if (intrin.baseType == TYP_DOUBLE)
{
candidates = RBM_SVE_INDEXED_D_ELEMENT_ALLOWED_REGS;
}
else
{
assert(intrin.baseType == TYP_FLOAT);
candidates = RBM_SVE_INDEXED_S_ELEMENT_ALLOWED_REGS;
}
}

if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp()) &&
(intrin.op2->isRMWHWIntrinsic(compiler)))
{
Expand Down Expand Up @@ -1845,7 +1862,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou

if (intrin.op3 != nullptr)
{
srcCount += isRMW ? BuildDelayFreeUses(intrin.op3, intrin.op1) : BuildOperandUses(intrin.op3);
srcCount += isRMW ? BuildDelayFreeUses(intrin.op3, intrin.op1, candidates)
: BuildOperandUses(intrin.op3, candidates);

if (intrin.op4 != nullptr)
{
5 changes: 2 additions & 3 deletions src/coreclr/jit/morph.cpp
@@ -10717,10 +10717,9 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType());
GenTreeHWIntrinsic* cvtOp2 = op2->AsHWIntrinsic();
unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType());

if ((genTypeSize(cvtOp2->GetSimdBaseType()) != simdBaseTypeSize))
if (!op2->OperIsHWIntrinsic() || (genTypeSize(op2->AsHWIntrinsic()->GetSimdBaseType()) != simdBaseTypeSize))
{
// We need the operand to be the same kind of mask; otherwise
// the bitwise operation can differ in how it performs
4 changes: 3 additions & 1 deletion src/coreclr/jit/targetarm64.h
@@ -376,9 +376,11 @@
// For arm64, this is the maximum prolog establishment pre-indexed (that is SP pre-decrement) offset.
#define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES 512

// Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g. "MLA (by element)")
// Some "Advanced SIMD / SVE scalar x indexed element" and "Advanced SIMD / SVE vector x indexed element" instructions (e.g. "MLA (by element)")
// have encoding that restricts what registers that can be used for the indexed element when the element size is H (i.e. 2 bytes).
#define RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7|RBM_V8|RBM_V9|RBM_V10|RBM_V11|RBM_V12|RBM_V13|RBM_V14|RBM_V15)
#define RBM_SVE_INDEXED_S_ELEMENT_ALLOWED_REGS (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7)
#define RBM_SVE_INDEXED_D_ELEMENT_ALLOWED_REGS RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS

#define REG_ZERO_INIT_FRAME_REG1 REG_R9
#define REG_ZERO_INIT_FRAME_REG2 REG_R10