Skip to content

Commit

Permalink
Merge pull request #2301 from lioncash/insps
Browse files Browse the repository at this point in the history
OpcodeDispatcher: Handle VINSERTPS
  • Loading branch information
Sonicadvance1 authored Dec 31, 2022
2 parents faa81f2 + 8c005db commit 6caf764
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 20 deletions.
3 changes: 3 additions & 0 deletions External/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6077,6 +6077,9 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() {
{OPD(3, 0b01, 0x17), 1, &OpDispatchBuilder::PExtrOp<4>},

{OPD(3, 0b01, 0x18), 1, &OpDispatchBuilder::VINSERTOp},

{OPD(3, 0b01, 0x21), 1, &OpDispatchBuilder::VINSERTPSOp},

{OPD(3, 0b01, 0x38), 1, &OpDispatchBuilder::VINSERTOp},

{OPD(3, 0b01, 0x46), 1, &OpDispatchBuilder::VPERM2Op},
Expand Down
5 changes: 5 additions & 0 deletions External/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,7 @@ enum class SelectionFlag {
void VHADDPOp(OpcodeArgs);

void VINSERTOp(OpcodeArgs);
void VINSERTPSOp(OpcodeArgs);

void VMOVAPS_VMOVAPD_Op(OpcodeArgs);
void VMOVUPS_VMOVUPD_Op(OpcodeArgs);
Expand Down Expand Up @@ -753,6 +754,10 @@ enum class SelectionFlag {
OrderedNode* ExtendVectorElementsImpl(OpcodeArgs, size_t ElementSize,
size_t DstElementSize, bool Signed);

// Shared implementation behind the INSERTPS and VINSERTPS handlers.
// Src1 is the vector that is inserted into, Src2 supplies the 32-bit element
// being inserted (register or memory operand), and Imm is the immediate
// operand, which must be a literal. Returns the node holding the result.
OrderedNode* InsertPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1,
const X86Tables::DecodedOperand& Src2,
const X86Tables::DecodedOperand& Imm);

OrderedNode* PACKSSOpImpl(OpcodeArgs, size_t ElementSize,
OrderedNode *Src1, OrderedNode *Src2);

Expand Down
54 changes: 35 additions & 19 deletions External/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1184,48 +1184,64 @@ void OpDispatchBuilder::PINSROp<4>(OpcodeArgs);
template
void OpDispatchBuilder::PINSROp<8>(OpcodeArgs);

void OpDispatchBuilder::InsertPSOp(OpcodeArgs) {
LOGMAN_THROW_A_FMT(Op->Src[1].IsLiteral(), "Src1 needs to be literal here");
uint8_t Imm = Op->Src[1].Data.Literal.Value;
uint8_t CountS = (Imm >> 6);
uint8_t CountD = (Imm >> 4) & 0b11;
uint8_t ZMask = Imm & 0xF;
// Builds the IR for a single-precision element insert (INSERTPS/VINSERTPS).
// Src1: vector that receives the element.
// Src2: operand providing the 32-bit element (register or memory).
// Imm:  immediate operand; asserted to be a literal.
// Returns the node holding the resulting vector.
OrderedNode* OpDispatchBuilder::InsertPSOpImpl(OpcodeArgs, const X86Tables::DecodedOperand& Src1,
const X86Tables::DecodedOperand& Src2,
const X86Tables::DecodedOperand& Imm) {
LOGMAN_THROW_A_FMT(Imm.IsLiteral(), "Imm needs to be literal here");
const uint8_t ImmValue = Imm.Data.Literal.Value;
// Immediate layout: CountS = bits [7:6], CountD = bits [5:4], ZMask = bits [3:0].
uint8_t CountS = (ImmValue >> 6);
uint8_t CountD = (ImmValue >> 4) & 0b11;
const uint8_t ZMask = ImmValue & 0xF;

const auto DstSize = GetDstSize(Op);

OrderedNode *Dest{};
if (ZMask != 0xF) {
// Only need to load destination if it isn't a full zero
Dest = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, GetDstSize(Op), Op->Flags, -1);
Dest = LoadSource_WithOpSize(FPRClass, Op, Src1, DstSize, Op->Flags, -1);
}

// The insert is only emitted when the ZMask bit for element CountD is clear;
// otherwise that element gets zeroed below anyway.
if (!(ZMask & (1 << CountD))) {
if ((ZMask & (1 << CountD)) == 0) {
// In the case that ZMask overwrites the destination element, then don't even insert
OrderedNode *Src{};
if (Op->Src[0].IsGPR()) {
Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
}
else {
if (Src2.IsGPR()) {
Src = LoadSource(FPRClass, Op, Src2, Op->Flags, -1);
} else {
// If loading from memory then CountS is forced to zero
CountS = 0;
// Memory sources are loaded as a single 4-byte element.
Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], 4, Op->Flags, -1);
Src = LoadSource_WithOpSize(FPRClass, Op, Src2, 4, Op->Flags, -1);
}

// Copy element CountS of Src into element CountD of Dest (4-byte elements).
Dest = _VInsElement(GetDstSize(Op), 4, CountD, CountS, Dest, Src);
Dest = _VInsElement(DstSize, 4, CountD, CountS, Dest, Src);
}

// ZMask happens after insert
if (ZMask == 0xF) {
// All four elements masked: the result is simply a zero vector.
Dest = _VectorImm(16, 4, 0);
return _VectorImm(16, 4, 0);
}
else if (ZMask) {

if (ZMask) {
auto Zero = _VectorImm(16, 4, 0);
// Clear every 32-bit element whose ZMask bit is set by inserting
// element 0 of the zero vector into it.
for (size_t i = 0; i < 4; ++i) {
if (ZMask & (1 << i)) {
Dest = _VInsElement(GetDstSize(Op), 4, i, 0, Dest, Zero);
if ((ZMask & (1 << i)) != 0) {
Dest = _VInsElement(DstSize, 4, i, 0, Dest, Zero);
}
}
}

StoreResult(FPRClass, Op, Dest, -1);
return Dest;
}

// Handler for the INSERTPS form whose destination also acts as the first
// source: Op->Dest is forwarded as Src1, Op->Src[0] as the inserted operand
// and Op->Src[1] as the immediate. The vector produced by the shared
// implementation is written back to the destination.
void OpDispatchBuilder::InsertPSOp(OpcodeArgs) {
  StoreResult(FPRClass, Op,
              InsertPSOpImpl(Op, Op->Dest, Op->Src[0], Op->Src[1]),
              -1);
}

// Handler for VINSERTPS: the three source operands map directly onto the
// shared implementation (Src[0] = vector inserted into, Src[1] = inserted
// operand, Src[2] = immediate).
void OpDispatchBuilder::VINSERTPSOp(OpcodeArgs) {
  OrderedNode *Inserted = InsertPSOpImpl(Op, Op->Src[0], Op->Src[1], Op->Src[2]);

  // The result goes through a 16-byte vector move before being stored.
  // NOTE(review): presumably this keeps anything above the low 128 bits from
  // being carried into the destination - confirm against _VMov semantics.
  OrderedNode *Moved = _VMov(16, Inserted);
  StoreResult(FPRClass, Op, Moved, -1);
}

template<size_t ElementSize>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ void InitializeVEXTables() {
{OPD(3, 0b01, 0x1D), 1, X86InstInfo{"VCVTPS2PH", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},

{OPD(3, 0b01, 0x20), 1, X86InstInfo{"VPINSRB", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
{OPD(3, 0b01, 0x21), 1, X86InstInfo{"VINSERTPS", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},
{OPD(3, 0b01, 0x21), 1, X86InstInfo{"VINSERTPS", TYPE_INST, GenFlagsSameSize(SIZE_128BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1, nullptr}},
{OPD(3, 0b01, 0x22), 1, X86InstInfo{"VPINSRD", TYPE_UNDEC, FLAGS_NONE, 0, nullptr}},

{OPD(3, 0b01, 0x38), 1, X86InstInfo{"VINSERTI128", TYPE_INST, GenFlagsSameSize(SIZE_256BIT) | FLAGS_MODRM | FLAGS_VEX_1ST_SRC | FLAGS_XMM_FLAGS, 1, nullptr}},
Expand Down
52 changes: 52 additions & 0 deletions unittests/ASM/VEX/vinsertps.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
%ifdef CONFIG
{
"HostFeatures": ["AVX"],
"RegData": {
"XMM2": ["0x4142434465666768", "0x5152535455565758", "0x0000000000000000", "0x0000000000000000"],
"XMM3": ["0x4142434461626364", "0x5152535455565758", "0x0000000000000000", "0x0000000000000000"],
"XMM4": ["0x7576777845464748", "0x5152535455565758", "0x0000000000000000", "0x0000000000000000"],
"XMM5": ["0x4142434445464748", "0x5152535471727374", "0x0000000000000000", "0x0000000000000000"],
"XMM6": ["0x4142434445464748", "0x7576777855565758", "0x0000000000000000", "0x0000000000000000"],
"XMM7": ["0x4142434445464748", "0x5152535475767778", "0x0000000000000000", "0x0000000000000000"],
"XMM8": ["0x7576777845464748", "0x5152535455565758", "0x0000000000000000", "0x0000000000000000"],
"XMM9": ["0x4142434475767778", "0x5152535455565758", "0x0000000000000000", "0x0000000000000000"],
"XMM10": ["0x0000000065666768", "0x5152535455565758", "0x0000000000000000", "0x0000000000000000"],
"XMM11": ["0x0000000061626364", "0x5152535455565758", "0x0000000000000000", "0x0000000000000000"],
"XMM12": ["0x0000000000000000", "0x0000000000000000", "0x0000000000000000", "0x0000000000000000"]
}
}
%endif

; rdx = address of the test vectors defined at .data below.
lea rdx, [rel .data]

; xmm0 = first 16 bytes of .data (the vector inserted into),
; xmm1 = second 16 bytes of .data (the vector elements are taken from).
vmovapd xmm0, [rdx]
vmovapd xmm1, [rdx + 16]

; The immediate in every case below is written as
;   (source element << 6) | (destination element << 4) | zero mask

; Simple move Reg<-Reg
vinsertps xmm2, xmm0, xmm1, ((0b00 << 6) | (0b00 << 4) | (0b0000))
vinsertps xmm3, xmm0, xmm1, ((0b01 << 6) | (0b00 << 4) | (0b0000))
vinsertps xmm4, xmm0, xmm1, ((0b10 << 6) | (0b01 << 4) | (0b0000))
vinsertps xmm5, xmm0, xmm1, ((0b11 << 6) | (0b10 << 4) | (0b0000))

; Simple move Reg<-Mem
; The memory operand is the dword at [rdx + 24] (0x75767778). The expected
; results insert that same dword for every source-element value, i.e. the
; source-element bits are ignored for memory operands.
vinsertps xmm6, xmm0, [rdx + 8 * 3], ((0b00 << 6) | (0b11 << 4) | (0b0000))
vinsertps xmm7, xmm0, [rdx + 8 * 3], ((0b01 << 6) | (0b10 << 4) | (0b0000))
vinsertps xmm8, xmm0, [rdx + 8 * 3], ((0b10 << 6) | (0b01 << 4) | (0b0000))
vinsertps xmm9, xmm0, [rdx + 8 * 3], ((0b11 << 6) | (0b00 << 4) | (0b0000))

; Simple move Reg<-Reg with mask
; Zero mask 0b0010 clears element 1 of the result after the insert.
vinsertps xmm10, xmm0, xmm1, ((0b00 << 6) | (0b00 << 4) | (0b0010))
vinsertps xmm11, xmm0, xmm1, ((0b01 << 6) | (0b00 << 4) | (0b0010))

; Full ZMask
; All four mask bits set: the expected result is an all-zero register.
vinsertps xmm12, xmm0, xmm1, ((0b00 << 6) | (0b00 << 4) | (0b1111))

hlt

; Test vectors: 32 bytes, 32-byte aligned so the vmovapd loads above are aligned.
align 32
.data:
dq 0x4142434445464748
dq 0x5152535455565758

dq 0x6162636465666768
dq 0x7172737475767778

0 comments on commit 6caf764

Please sign in to comment.