-
Notifications
You must be signed in to change notification settings - Fork 11.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[PowerPC] Support conversion between f16 and f128 #97677
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-powerpc Author: Esme (EsmeYi) ChangesThis patch enables conversion between f16 and f128, expanding on pre-Power9 targets and using HW instructions on Power9. Full diff: https://github.com/llvm/llvm-project/pull/97677.diff 3 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 4d4008ac0ba70..360c463929b62 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -211,18 +211,24 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
}
if (Subtarget.isISA3_0()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal);
+ setTruncStoreAction(MVT::f128, MVT::f16, Legal);
setTruncStoreAction(MVT::f64, MVT::f16, Legal);
setTruncStoreAction(MVT::f32, MVT::f16, Legal);
} else {
// No extending loads from f16 or HW conversions back and forth.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index dd07892794d59..51aa0be7439c6 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3993,6 +3993,10 @@ defm : ScalToVecWPermute<
(SUBREG_TO_REG (i64 1), (VEXTSH2Ds (LXSIHZX ForceXForm:$src)), sub_64)>;
// Load/convert and convert/store patterns for f16.
+def : Pat<(f128 (extloadf16 ForceXForm:$src)),
+ (f128 (XSCVDPQP (XSCVHPDP (LXSIHZX ForceXForm:$src))))>;
+def : Pat<(truncstoref16 f128:$src, ForceXForm:$dst),
+ (STXSIHX (XSCVDPHP (XSCVQPDP $src)), ForceXForm:$dst)>;
def : Pat<(f64 (extloadf16 ForceXForm:$src)),
(f64 (XSCVHPDP (LXSIHZX ForceXForm:$src)))>;
def : Pat<(truncstoref16 f64:$src, ForceXForm:$dst),
@@ -4001,6 +4005,8 @@ def : Pat<(f32 (extloadf16 ForceXForm:$src)),
(f32 (COPY_TO_REGCLASS (XSCVHPDP (LXSIHZX ForceXForm:$src)), VSSRC))>;
def : Pat<(truncstoref16 f32:$src, ForceXForm:$dst),
(STXSIHX (XSCVDPHP (COPY_TO_REGCLASS $src, VSFRC)), ForceXForm:$dst)>;
+def : Pat<(f128 (f16_to_fp i32:$A)),
+ (f128 (XSCVDPQP (XSCVHPDP (MTVSRWZ $A))))>;
def : Pat<(f64 (f16_to_fp i32:$A)),
(f64 (XSCVHPDP (MTVSRWZ $A)))>;
def : Pat<(f32 (f16_to_fp i32:$A)),
@@ -4008,6 +4014,8 @@ def : Pat<(f32 (f16_to_fp i32:$A)),
def : Pat<(i32 (fp_to_f16 f32:$A)),
(i32 (MFVSRWZ (XSCVDPHP (COPY_TO_REGCLASS $A, VSFRC))))>;
def : Pat<(i32 (fp_to_f16 f64:$A)), (i32 (MFVSRWZ (XSCVDPHP $A)))>;
+def : Pat<(i32 (fp_to_f16 f128:$A)),
+ (i32 (MFVSRWZ (XSCVDPHP (XSCVQPDP $A))))>;
// Vector sign extensions
def : Pat<(f64 (PPCVexts f64:$A, 1)),
diff --git a/llvm/test/CodeGen/PowerPC/f16-to-from-f128.ll b/llvm/test/CodeGen/PowerPC/f16-to-from-f128.ll
new file mode 100644
index 0000000000000..4f1e7da09b820
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/f16-to-from-f128.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \
+; RUN: -verify-machineinstrs -ppc-asm-full-reg-names < %s | FileCheck %s \
+; RUN: --check-prefix=P8
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
+; RUN: -verify-machineinstrs -ppc-asm-full-reg-names < %s | FileCheck %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown -mattr=-hard-float \
+; RUN: -verify-machineinstrs -ppc-asm-full-reg-names < %s | FileCheck %s \
+; RUN: --check-prefix=SOFT
+
+define half @trunc(fp128 %a) unnamed_addr {
+; P8-LABEL: trunc:
+; P8: # %bb.0: # %entry
+; P8-NEXT: mflr r0
+; P8-NEXT: stdu r1, -32(r1)
+; P8-NEXT: std r0, 48(r1)
+; P8-NEXT: .cfi_def_cfa_offset 32
+; P8-NEXT: .cfi_offset lr, 16
+; P8-NEXT: bl __trunctfhf2
+; P8-NEXT: nop
+; P8-NEXT: clrldi r3, r3, 48
+; P8-NEXT: bl __gnu_h2f_ieee
+; P8-NEXT: nop
+; P8-NEXT: addi r1, r1, 32
+; P8-NEXT: ld r0, 16(r1)
+; P8-NEXT: mtlr r0
+; P8-NEXT: blr
+;
+; CHECK-LABEL: trunc:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xscvqpdp v2, v2
+; CHECK-NEXT: xscvdphp f0, vs34
+; CHECK-NEXT: mffprwz r3, f0
+; CHECK-NEXT: clrlwi r3, r3, 16
+; CHECK-NEXT: mtfprwz f0, r3
+; CHECK-NEXT: xscvhpdp f1, f0
+; CHECK-NEXT: blr
+;
+; SOFT-LABEL: trunc:
+; SOFT: # %bb.0: # %entry
+; SOFT-NEXT: mflr r0
+; SOFT-NEXT: stdu r1, -32(r1)
+; SOFT-NEXT: std r0, 48(r1)
+; SOFT-NEXT: .cfi_def_cfa_offset 32
+; SOFT-NEXT: .cfi_offset lr, 16
+; SOFT-NEXT: bl __trunctfhf2
+; SOFT-NEXT: nop
+; SOFT-NEXT: clrldi r3, r3, 48
+; SOFT-NEXT: bl __gnu_h2f_ieee
+; SOFT-NEXT: nop
+; SOFT-NEXT: bl __gnu_f2h_ieee
+; SOFT-NEXT: nop
+; SOFT-NEXT: addi r1, r1, 32
+; SOFT-NEXT: ld r0, 16(r1)
+; SOFT-NEXT: mtlr r0
+; SOFT-NEXT: blr
+entry:
+ %0 = fptrunc fp128 %a to half
+ ret half %0
+}
+
+define fp128 @ext(half %a) unnamed_addr {
+; P8-LABEL: ext:
+; P8: # %bb.0: # %entry
+; P8-NEXT: mflr r0
+; P8-NEXT: stdu r1, -32(r1)
+; P8-NEXT: std r0, 48(r1)
+; P8-NEXT: .cfi_def_cfa_offset 32
+; P8-NEXT: .cfi_offset lr, 16
+; P8-NEXT: bl __extendsfkf2
+; P8-NEXT: nop
+; P8-NEXT: addi r1, r1, 32
+; P8-NEXT: ld r0, 16(r1)
+; P8-NEXT: mtlr r0
+; P8-NEXT: blr
+;
+; CHECK-LABEL: ext:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xscpsgndp vs34, f1, f1
+; CHECK-NEXT: xscvdpqp v2, v2
+; CHECK-NEXT: blr
+;
+; SOFT-LABEL: ext:
+; SOFT: # %bb.0: # %entry
+; SOFT-NEXT: mflr r0
+; SOFT-NEXT: stdu r1, -32(r1)
+; SOFT-NEXT: std r0, 48(r1)
+; SOFT-NEXT: .cfi_def_cfa_offset 32
+; SOFT-NEXT: .cfi_offset lr, 16
+; SOFT-NEXT: clrldi r3, r3, 48
+; SOFT-NEXT: bl __gnu_h2f_ieee
+; SOFT-NEXT: nop
+; SOFT-NEXT: bl __extendsfkf2
+; SOFT-NEXT: nop
+; SOFT-NEXT: addi r1, r1, 32
+; SOFT-NEXT: ld r0, 16(r1)
+; SOFT-NEXT: mtlr r0
+; SOFT-NEXT: blr
+entry:
+ %0 = fpext half %a to fp128
+ ret fp128 %0
+}
|
Should tests at |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For truncation, if you convert f128->f64->f16, you'll round twice, and therefore get the wrong result in some cases. You need to ensure you only round once.
…conversion) instead of HW instructions (stepwise conversion).
Thanks Eli @efriedma-quic ! This is a great find. I just realized that the intermediate rounding during the stepwise conversion (f128->f64->f16) can produce a different result compared to the direct conversion (`f128->f16). |
Thank you @tgross35. Sorry that I missed the comment before. I've updated the 2 tests and removed llvm/test/CodeGen/PowerPC/f16-to-from-f128.ll, since it was redundant then. |
Gentle ping. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks reasonable to me except some nits which can be addressed in the merge commit. Thanks for fixing this.
; P8-NEXT: mflr r0 | ||
; P8-NEXT: stdu r1, -32(r1) | ||
; P8-NEXT: std r0, 48(r1) | ||
; P8-NEXT: .cfi_def_cfa_offset 32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: add nounwind
attribute to avoid such instrucitons
; SOFT-NEXT: mtlr r0 | ||
; SOFT-NEXT: blr | ||
entry: | ||
%0 = fptrunc fp128 %a to half |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: maybe we should avoid this unnamed instruction.
; SOFT-NEXT: mtlr r0 | ||
; SOFT-NEXT: blr | ||
entry: | ||
%0 = fpext half %a to fp128 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: maybe we should avoid this unnamed instruction.
ret half %0 | ||
} | ||
|
||
define fp128 @ext(half %a) unnamed_addr { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Is the unnamed_addr
needed?
; CHECK-P8-NEXT: mtlr r0 | ||
; CHECK-P8-NEXT: blr | ||
entry: | ||
%0 = fptrunc fp128 %a to half |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: We'd better avoid the unnamed variable names, like %0
. (Suggested by https://llvm.org/docs/TestingGuide.html#writing-new-regression-tests)
@@ -210,13 +210,19 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, | |||
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); | |||
} | |||
|
|||
setTruncStoreAction(MVT::f128, MVT::f16, Expand); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: could you please add a comment here why we can not use PWR9 instructions to do the conversion.
This patch enables conversion between f16 and f128, expanding on pre-Power9 targets and using HW instructions on Power9.
Fixes #92866