diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h
index c907fd95d036f..8fd35e92f6475 100644
--- a/src/coreclr/jit/hwintrinsic.h
+++ b/src/coreclr/jit/hwintrinsic.h
@@ -204,7 +204,7 @@ enum HWIntrinsicFlag : unsigned int
     // The intrinsic uses a mask in arg1 to select elements present in the result
     HW_Flag_ExplicitMaskedOperation = 0x20000,
 
-    // The intrinsic uses a mask in arg1 (either explicitly, embdedd or optionally embedded) to select elements present
+    // The intrinsic uses a mask in arg1 (either explicitly, embedded or optionally embedded) to select elements present
     // in the result, and must use a low register.
     HW_Flag_LowMaskedOperation = 0x40000,
 
diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp
index 79bf3c16778df..3118f81f3b29b 100644
--- a/src/coreclr/jit/lsraarm64.cpp
+++ b/src/coreclr/jit/lsraarm64.cpp
@@ -1663,6 +1663,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
         {
             assert(isRMW);
             assert(intrin.op1->OperIs(GT_FIELD_LIST));
+            GenTreeFieldList* op1 = intrin.op1->AsFieldList();
 
             assert(compiler->info.compNeedsConsecutiveRegisters);
 
@@ -1724,7 +1725,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
                 }
             }
         }
-        else if (intrinsicTree->OperIsMemoryLoadOrStore())
+        else if ((intrinsicTree->OperIsMemoryLoadOrStore()) && (intrin.id != NI_AdvSimd_LoadAndInsertScalar))
         {
             srcCount += BuildAddrUses(intrin.op1);
         }
@@ -2151,7 +2152,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
             {
                 SingleTypeRegSet candidates = lowVectorOperandNum == 3 ? lowVectorCandidates : RBM_NONE;
 
-                if (isRMW)
+                if (intrin.id == NI_AdvSimd_LoadAndInsertScalar)
+                {
+                    srcCount += BuildAddrUses(intrin.op3);
+                }
+                else if (isRMW)
                 {
                     srcCount += BuildDelayFreeUses(intrin.op3, (tgtPrefOp2 ? intrin.op2 : intrin.op1), candidates);
                 }
diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index 3c390ee213941..368a5e82d53c7 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -3846,9 +3846,25 @@ int LinearScan::BuildDelayFreeUses(GenTree* node,
             return 0;
         }
     }
+
+    // Don't mark as delay free if there is a mismatch in register types
+    bool addDelayFreeUses = false;
+    // Multi register nodes should not go via this route.
+    assert(!node->IsMultiRegNode());
+    // Multi register nodes should always use fp registers (this includes vectors).
+    assert(varTypeUsesFloatReg(node->TypeGet()) || !node->IsMultiRegNode());
+    if (rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) ||
+        (rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet())))
+    {
+        addDelayFreeUses = true;
+    }
+
     if (use != nullptr)
     {
-        AddDelayFreeUses(use, rmwNode);
+        if (addDelayFreeUses)
+        {
+            AddDelayFreeUses(use, rmwNode);
+        }
         if (useRefPositionRef != nullptr)
         {
             *useRefPositionRef = use;
@@ -3864,15 +3880,20 @@
         if (addrMode->HasBase() && !addrMode->Base()->isContained())
         {
             use = BuildUse(addrMode->Base(), candidates);
-            AddDelayFreeUses(use, rmwNode);
-
+            if (addDelayFreeUses)
+            {
+                AddDelayFreeUses(use, rmwNode);
+            }
             srcCount++;
         }
+
         if (addrMode->HasIndex() && !addrMode->Index()->isContained())
         {
             use = BuildUse(addrMode->Index(), candidates);
-            AddDelayFreeUses(use, rmwNode);
-
+            if (addDelayFreeUses)
+            {
+                AddDelayFreeUses(use, rmwNode);
+            }
             srcCount++;
         }
 
diff --git a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/LoadAndInsertScalarTest.template b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/LoadAndInsertScalarTest.template
index 12c87a5134c74..4d51a82f999b9 100644
--- a/src/tests/JIT/HardwareIntrinsics/Arm/Shared/LoadAndInsertScalarTest.template
+++ b/src/tests/JIT/HardwareIntrinsics/Arm/Shared/LoadAndInsertScalarTest.template
@@ -45,6 +45,9 @@ namespace JIT.HardwareIntrinsics.Arm
                 // Validates passing an instance member of a class works
                 test.RunClassFldScenario();
 
+                // Validates passing a non-constant value works
+                test.RunClassFldScenario_NotConstant();
+
                 // Validates passing the field of a local struct works
                 test.RunStructLclFldScenario();
 
@@ -150,6 +153,7 @@ namespace JIT.HardwareIntrinsics.Arm
         private static {Op1BaseType}[] _data1 = new {Op1BaseType}[Op1ElementCount];
 
         private {Op1VectorType}<{Op1BaseType}> _fld1;
+        private byte _fld2;
         private {Op1BaseType} _fld3;
 
         private DataTable _dataTable;
@@ -161,6 +165,7 @@ namespace JIT.HardwareIntrinsics.Arm
 
            for (var i = 0; i < Op1ElementCount; i++) { _data1[i] = {NextValueOp1}; }
             Unsafe.CopyBlockUnaligned(ref Unsafe.As<{Op1VectorType}<{Op1BaseType}>, byte>(ref _fld1), ref Unsafe.As<{Op1BaseType}, byte>(ref _data1[0]), (uint)Unsafe.SizeOf<{Op1VectorType}<{Op1BaseType}>>());
+            _fld2 = {ElementIndex};
             _fld3 = {NextValueOp3};
 
             for (var i = 0; i < Op1ElementCount; i++) { _data1[i] = {NextValueOp1}; }
@@ -247,6 +252,20 @@ namespace JIT.HardwareIntrinsics.Arm
             ValidateResult(_fld1, _fld3, _dataTable.outArrayPtr);
         }
 
+        public void RunClassFldScenario_NotConstant()
+        {
+            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_NotConstant));
+
+            fixed ({Op1BaseType}* pFld3 = &_fld3)
+            {
+                var result = {Isa}.{Method}(_fld1, _fld2, pFld3);
+
+                Unsafe.Write(_dataTable.outArrayPtr, result);
+            }
+
+            ValidateResult(_fld1, _fld3, _dataTable.outArrayPtr);
+        }
+
         public void RunStructLclFldScenario()
         {
             TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));
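
Note for reviewers: a minimal sketch of the shape the new `RunClassFldScenario_NotConstant` scenario exercises — `LoadAndInsertScalar` with an element index that is not a JIT-time constant, which is what drives the new `BuildAddrUses(intrin.op3)` path in lsraarm64.cpp instead of the RMW delay-free path. This is illustrative only; the class and field names below are hypothetical and not part of the change.

```csharp
// Illustrative repro shape, assuming an arm64 machine with AdvSimd support.
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;

unsafe class LoadAndInsertScalarNonConstIndex
{
    // The index is read from a mutable static field, so the JIT cannot fold it
    // to an immediate lane number.
    private static byte s_index = 2;

    static void Main()
    {
        if (!AdvSimd.IsSupported)
        {
            return;
        }

        int element = 42;
        Vector128<int> value = Vector128.Create(1, 2, 3, 4);

        // op3 is a memory address; with a non-constant index the register
        // allocator now builds address uses for it rather than marking it
        // delay-free against the RMW vector operand.
        Vector128<int> result = AdvSimd.LoadAndInsertScalar(value, s_index, &element);

        Console.WriteLine(result); // <1, 2, 42, 4>
    }
}
```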