Skip to content

Commit

Permalink
Merge pull request #3843 from Sonicadvance1/remove_half_moves_fma3
Browse files (browse the repository at this point in the history)
Arm64: Remove one move if possible in FMA operations
  • Loading branch information
Sonicadvance1 authored Jul 9, 2024
2 parents 7ccb252 + 80de890 commit 3bea08d
Show file tree
Hide file tree
Showing 4 changed files with 344 additions and 372 deletions.
108 changes: 84 additions & 24 deletions FEXCore/Source/Interface/Core/JIT/Arm64/VectorOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4047,12 +4047,16 @@ DEF_OP(VFMLA) {
const auto Mask = PRED_TMP_32B.Merging();
ARMEmitter::VRegister DestTmp = Dst;
if (Dst != VectorAddend) {
DestTmp = VTMP1;
if (Dst != Vector1 && Dst != Vector2) {
DestTmp = Dst;
} else {
DestTmp = VTMP1;
}
mov(DestTmp.Z(), VectorAddend.Z());
}

fmla(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z());
if (Dst != VectorAddend) {
if (Dst != DestTmp) {
mov(Dst.Z(), DestTmp.Z());
}
} else {
Expand All @@ -4068,7 +4072,11 @@ DEF_OP(VFMLA) {
}
ARMEmitter::VRegister DestTmp = Dst;
if (Dst != VectorAddend) {
DestTmp = VTMP1;
if (Dst != Vector1 && Dst != Vector2) {
DestTmp = Dst;
} else {
DestTmp = VTMP1;
}
mov(DestTmp.Q(), VectorAddend.Q());
}
if (OpSize == 16) {
Expand All @@ -4077,7 +4085,7 @@ DEF_OP(VFMLA) {
fmla(SubRegSize, DestTmp.D(), Vector1.D(), Vector2.D());
}

if (Dst != VectorAddend) {
if (Dst != DestTmp) {
mov(Dst.Q(), DestTmp.Q());
}
}
Expand Down Expand Up @@ -4105,24 +4113,32 @@ DEF_OP(VFMLS) {
const auto Mask = PRED_TMP_32B.Merging();
ARMEmitter::VRegister DestTmp = Dst;
if (Dst != VectorAddend) {
DestTmp = VTMP1;
if (Dst != Vector1 && Dst != Vector2) {
DestTmp = Dst;
} else {
DestTmp = VTMP1;
}
mov(DestTmp.Z(), VectorAddend.Z());
}

fnmls(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z());
if (Dst != VectorAddend) {
if (Dst != DestTmp) {
mov(Dst.Z(), DestTmp.Z());
}
} else if (HostSupportsSVE128 && Is128Bit) {
const auto Mask = PRED_TMP_16B.Merging();
ARMEmitter::VRegister DestTmp = Dst;
if (Dst != VectorAddend) {
DestTmp = VTMP1;
if (Dst != Vector1 && Dst != Vector2) {
DestTmp = Dst;
} else {
DestTmp = VTMP1;
}
mov(DestTmp.Z(), VectorAddend.Z());
}

fnmls(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z());
if (Dst != VectorAddend) {
if (Dst != DestTmp) {
mov(Dst.Z(), DestTmp.Z());
}
} else {
Expand All @@ -4138,15 +4154,29 @@ DEF_OP(VFMLS) {
}

// Addend needs to get negated to match correct behaviour here.
ARMEmitter::VRegister DestTmp = VTMP1;
ARMEmitter::VRegister DestTmp = Dst;
if (Dst == Vector1 || Dst == Vector2) {
DestTmp = VTMP1;
}

if (Is128Bit) {
fneg(SubRegSize, DestTmp.Q(), VectorAddend.Q());
fmla(SubRegSize, DestTmp.Q(), Vector1.Q(), Vector2.Q());
mov(Dst.Q(), DestTmp.Q());
} else {
fneg(SubRegSize, DestTmp.D(), VectorAddend.D());
}

if (Is128Bit) {
fmla(SubRegSize, DestTmp.Q(), Vector1.Q(), Vector2.Q());
} else {
fmla(SubRegSize, DestTmp.D(), Vector1.D(), Vector2.D());
mov(Dst.D(), DestTmp.D());
}

if (DestTmp != Dst) {
if (Is128Bit) {
mov(Dst.Q(), DestTmp.Q());
} else {
mov(Dst.D(), DestTmp.D());
}
}
}
}
Expand All @@ -4172,12 +4202,16 @@ DEF_OP(VFNMLA) {
const auto Mask = PRED_TMP_32B.Merging();
ARMEmitter::VRegister DestTmp = Dst;
if (Dst != VectorAddend) {
DestTmp = VTMP1;
if (Dst != Vector1 && Dst != Vector2) {
DestTmp = Dst;
} else {
DestTmp = VTMP1;
}
mov(DestTmp.Z(), VectorAddend.Z());
}

fmls(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z());
if (Dst != VectorAddend) {
if (Dst != DestTmp) {
mov(Dst.Z(), DestTmp.Z());
}
} else {
Expand All @@ -4194,7 +4228,11 @@ DEF_OP(VFNMLA) {

ARMEmitter::VRegister DestTmp = Dst;
if (Dst != VectorAddend) {
DestTmp = VTMP1;
if (Dst != Vector1 && Dst != Vector2) {
DestTmp = Dst;
} else {
DestTmp = VTMP1;
}
mov(DestTmp.Q(), VectorAddend.Q());
}
if (OpSize == 16) {
Expand All @@ -4203,7 +4241,7 @@ DEF_OP(VFNMLA) {
fmls(SubRegSize, DestTmp.D(), Vector1.D(), Vector2.D());
}

if (Dst != VectorAddend) {
if (Dst != DestTmp) {
mov(Dst.Q(), DestTmp.Q());
}
}
Expand Down Expand Up @@ -4232,24 +4270,32 @@ DEF_OP(VFNMLS) {
const auto Mask = PRED_TMP_32B.Merging();
ARMEmitter::VRegister DestTmp = Dst;
if (Dst != VectorAddend) {
DestTmp = VTMP1;
if (Dst != Vector1 && Dst != Vector2) {
DestTmp = Dst;
} else {
DestTmp = VTMP1;
}
mov(DestTmp.Z(), VectorAddend.Z());
}

fnmla(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z());
if (Dst != VectorAddend) {
if (Dst != DestTmp) {
mov(Dst.Z(), DestTmp.Z());
}
} else if (HostSupportsSVE128 && Is128Bit) {
const auto Mask = PRED_TMP_16B.Merging();
ARMEmitter::VRegister DestTmp = Dst;
if (Dst != VectorAddend) {
DestTmp = VTMP1;
if (Dst != Vector1 && Dst != Vector2) {
DestTmp = Dst;
} else {
DestTmp = VTMP1;
}
mov(DestTmp.Z(), VectorAddend.Z());
}

fnmla(SubRegSize, DestTmp.Z(), Mask, Vector1.Z(), Vector2.Z());
if (Dst != VectorAddend) {
if (Dst != DestTmp) {
mov(Dst.Z(), DestTmp.Z());
}
} else {
Expand All @@ -4265,15 +4311,29 @@ DEF_OP(VFNMLS) {
}

// Addend needs to get negated to match correct behaviour here.
ARMEmitter::VRegister DestTmp = VTMP1;
ARMEmitter::VRegister DestTmp = Dst;
if (Dst == Vector1 || Dst == Vector2) {
DestTmp = VTMP1;
}

if (Is128Bit) {
fneg(SubRegSize, DestTmp.Q(), VectorAddend.Q());
fmls(SubRegSize, DestTmp.Q(), Vector1.Q(), Vector2.Q());
mov(Dst.Q(), DestTmp.Q());
} else {
fneg(SubRegSize, DestTmp.D(), VectorAddend.D());
}

if (Is128Bit) {
fmls(SubRegSize, DestTmp.Q(), Vector1.Q(), Vector2.Q());
} else {
fmls(SubRegSize, DestTmp.D(), Vector1.D(), Vector2.D());
mov(Dst.D(), DestTmp.D());
}

if (DestTmp != Dst) {
if (Is128Bit) {
mov(Dst.Q(), DestTmp.Q());
} else {
mov(Dst.D(), DestTmp.D());
}
}
}
}
Expand Down
Loading

0 comments on commit 3bea08d

Please sign in to comment.