-
Notifications
You must be signed in to change notification settings - Fork 12.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Add intrinsics for SME FP8 FDOT LANE instructions #118492
Conversation
Co-authored-by: Momchil Velikov <momchil.velikov@arm.com> Co-authored-by: Marian Lukac <marian.lukac@arm.com> Co-authored-by: Caroline Concatto <caroline.concatto@arm.com>
@llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-codegen Author: Jonathan Thackray (jthackray) ChangesPatch is 20.65 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118492.diff 8 Files Affected:
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 0f689e82bdb742..87ed68c03430cd 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -740,6 +740,11 @@ let SMETargetGuard = "sme2" in {
def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
}
+// FDOT
+let SMETargetGuard = "sme2,sme-f8f16" in {
+ def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
+}
////////////////////////////////////////////////////////////////////////////////
// SME2p1 - FMOPA, FMOPS (non-widening)
let SMETargetGuard = "sme-b16b16" in {
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index de10be7bdce0db..e7cc40db7dca6c 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -52,6 +52,7 @@ include "arm_immcheck_incl.td"
// h: half-float
// d: double
// b: bfloat
+// m: mfloat8
// Typespec modifiers
// ------------------
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 7588f8427cdd38..7de5e8bcd439d7 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10183,6 +10183,8 @@ CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
case SVETypeFlags::EltTyInt64:
return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);
+ case SVETypeFlags::EltTyMFloat8:
+ return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
case SVETypeFlags::EltTyFloat16:
return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
case SVETypeFlags::EltTyBFloat16:
@@ -11234,6 +11236,10 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
BuiltinID == SME::BI__builtin_sme_svstr_za)
return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+ // Emit set FPMR for intrinsics that require it
+ if (TypeFlags.setsFPMR())
+ Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
+ Ops.pop_back_val());
// Handle builtins which require their multi-vector operands to be swapped
swapCommutativeSMEOperands(BuiltinID, Ops);
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
new file mode 100644
index 00000000000000..999b1940df80c4
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
@@ -0,0 +1,57 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: aarch64-registered-target
+#include <arm_sme.h>
+
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z29test_svdot_lane_za16_f8_vg1x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
+ svmfloat8_t zm, fpm_t fpmr)
+ __arm_streaming __arm_inout("za") {
+ SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x2_fpm)(slice, zn, zm, 3, fpmr);
+}
+
+
+// CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z29test_svdot_lane_za16_f8_vg1x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za16_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
+ svmfloat8_t zm, fpm_t fpmr)
+ __arm_streaming __arm_inout("za") {
+ SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x4_fpm)(slice, zn, zm, 3, fpmr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index a91616b9556828..c4ce8c1917c9b6 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3864,3 +3864,25 @@ def int_aarch64_sve_famin_u : AdvSIMD_Pred2VectorArg_Intrinsic;
// Neon absolute maximum and minimum
def int_aarch64_neon_famax : AdvSIMD_2VectorArg_Intrinsic;
def int_aarch64_neon_famin : AdvSIMD_2VectorArg_Intrinsic;
+
+// SME FDOT instructions
+let TargetPrefix = "aarch64" in {
+
+
+class SME2_FP8_FDOT_LANE_VG1x2 :
+ DefaultAttrsIntrinsic<[], [llvm_i32_ty,
+ llvm_nxv16i8_ty, llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<4>>]>;
+
+class SME2_FP8_FDOT_LANE_VG1x4 :
+ DefaultAttrsIntrinsic<[], [llvm_i32_ty,
+ llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;
+
+ def int_aarch64_sme_fp8_fdot_lane_za16_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;
+ def int_aarch64_sme_fp8_fdot_lane_za16_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 37ac915d1d8808..f02a4b7bdbfaa8 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -986,8 +986,8 @@ def LUTI4_S_4ZZT2Z : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;
let Predicates = [HasSMEF8F16] in {
defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
-defm FDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fdot", 0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
-defm FDOT_VG4_M4ZZI_BtoH : sme2p1_multi_vec_array_vg4_index_f8f16<"fdot", 0b100, ZZZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fdot", 0b11, 0b010, ZZ_b_mul_r, int_aarch64_sme_fp8_fdot_lane_za16_vg1x2>;
+defm FDOT_VG4_M4ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x4<"fdot", 0b100, ZZZZ_b_mul_r, int_aarch64_sme_fp8_fdot_lane_za16_vg1x4>;
defm FDOT_VG2_M2ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
defm FDOT_VG4_M4ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;
// TODO: Replace nxv16i8 by nxv16f8
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 776472e72af05a..4b68df1b5ff29b 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -219,6 +219,37 @@ class SME2_Tile_Movaz_Pat<string name, SDPatternOperator intrinsic, ValueType ou
: Pat<(out_vt (intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)))),
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset)>;
+
+// FP8 SME FDOT instructions
+
+// Selection DAG patterns - map to first level of pseudo-instructions (xxx_PSEUDO)
+class SME2_FP8_FMLA_FDOT_Index_VG1x2_Pat<string name, SDPatternOperator intrinsic,
+ ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
+ ValueType vt = nxv16i8>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
+ ZPR4b8:$Zm, imm_ty:$i)>;
+
+class SME2_FP8_FMLA_FDOT_Index_VG1x4_Pat<string name, SDPatternOperator intrinsic,
+ ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
+ ValueType vt = nxv16i8>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4,
+ vt:$Zm, (i32 imm_ty:$i)),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+ ZPR4b8:$Zm, imm_ty:$i)>;
+
+class sme2_fp8_fmla_fdot_index_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty, Operand imm_ty>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i), []> {
+ let mayLoad = 1;
+ let SMEMatrixType = SMEMatrixArray;
+ let usesCustomInserter = 1;
+}
+
//===----------------------------------------------------------------------===//
// SME pattern match helpers.
//===----------------------------------------------------------------------===//
@@ -5737,3 +5768,113 @@ multiclass sme2_fmop4a_fp8_fp16_2way<string mnemonic> {
// Multiple vectors
def _M2Z2Z_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b1, 0b1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>;
}
+
+// FP8 SME FDOT instructions
+
+// Selection DAG patterns - map to first level of pseudo-instructions (xxx_PSEUDO)
+
+class SME2_FP8_FDOT_Index_VG1x2_Pat<string name, SDPatternOperator intrinsic,
+ ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
+ ValueType vt = nxv16i8>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i), i64:$fpmr),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
+ ZPR4b8:$Zm, imm_ty:$i, GPR64:$fpmr)>;
+
+class SME2_FP8_FDOT_Index_VG1x4_Pat<string name, SDPatternOperator intrinsic,
+ ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
+ ValueType vt = nxv16i8>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4,
+ vt:$Zm, (i32 imm_ty:$i), i64:$fpmr),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+ ZPR4b8:$Zm, imm_ty:$i, GPR64:$fpmr)>;
+
+// First level pseudo-instructions (xxx_PSEUDO) - transformed to second level pseudo-instructions (xxx_FPMR_PSEUDO)
+// during instruction selection.
+class sme2_fp8_fdot_index_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty, Operand imm_ty>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i, GPR64:$fpmr), []> {
+ let SMEMatrixType = SMEMatrixArray;
+ let usesCustomInserter = 1;
+}
+
+class sme2_fp8_fdot_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, GPR64:$fpmr), []> {
+ let SMEMatrixType = SMEMatrixArray;
+ let usesCustomInserter = 1;
+}
+
+// Second level pseudo-instruction - expanded to real instruction by the AArch64 pseudo instruction expansion pass
+class sme2_fp8_fdot_index_fpmr_pseudo<string name, MatrixOperand matrix_ty, Operand offset_ty,
+ RegisterOperand src1_ty, RegisterOperand src2_ty,
+ Operand imm_ty>
+ : Pseudo<(outs matrix_ty:$ZAda),
+ (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs,
+ src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i, GPR64:$fpmr), []>,
+ SMEPseudo2Instr<name, 1> {
+ let hasNoSchedulingInfo = 1;
+ let Constraints = "$ZAda = $_ZAda";
+}
+
+class sme2_fp8_fdot_fpmr_pseudo<string name, MatrixOperand matrix_ty, Operand offset_ty,
+ RegisterOperand src1_ty, RegisterOperand src2_ty>
+ : Pseudo<(outs matrix_ty:$ZAda),
+ (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs,
+ src1_ty:$Zn, src2_ty:$Zm, GPR64:$fpmr), []>,
+ SMEPseudo2Instr<name, 1> {
+ let hasNoSchedulingInfo = 1;
+ let Constraints = "$ZAda = $_ZAda";
+}
+
+// FDOT instructions
+multiclass sme2_fp8_fdot_index_za16_vg1x2<string mnemonic, bits<2> sz, bits<3> op,
+ RegisterOperand multi_vector_ty, SDPatternOperator intrinsic> {
+ def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
+ multi_vector_ty, ZPR4b8,
+ VectorIndexH32b_timm, mnemonic>,
+ SMEPseudo2Instr<NAME, 1>{
+ let Uses=[FPMR, FPCR];
+ let mayLoad = 1;
+
+ bits<3> i;
+ let Inst{11-10} = i{2-1};
+ let Inst{3} = i{0};
+ }
+
+ def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+ (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
+ multi_vector_ty:$Zn, ZPR4b8:$Zm, VectorIndexH32b_timm:$i), 0>;
+
+
+ def _PSEUDO : sme2_fp8_fmla_fdot_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, ZPR4b8, VectorIndexH32b_timm>;
+
+ def : SME2_FP8_FMLA_FDOT_Index_VG1x2_Pat<NAME, intrinsic, tileslice16, sme_elm_idx0_7, VectorIndexH32b_timm>;
+}
+
+multiclass sme2_fp8_fdot_index_za16_vg1x4<string mnemonic, bits<3> op,
+ RegisterOperand multi_vector_ty, SDPatternOperator intrinsic> {
+ def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16,
+ multi_vector_ty, ZPR4b8,
+ VectorIndexH32b_timm, mnemonic>,
+ SMEPseudo2Instr<NAME, 1> {
+ let Uses=[FPMR, FPCR];
+ let mayLoad = 1;
+
+ bits<3> i;
+ let Inst{11-10} = i{2-1};
+ let Inst{3} = i{0};
+ }
+
+ def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+ (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
+ sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, ZPR4b8:$Zm, VectorIndexH32b_timm:$i), 0>;
+
+
+ def _PSEUDO : sme2_fp8_fmla_fdot_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, ZPR4b8, VectorIndexH32b_timm>;
+
+ def : SME2_FP8_FMLA_FDOT_Index_VG1x4_Pat<NAME, intrinsic, tileslice16, sme_elm_idx0_7, VectorIndexH32b_timm>;
+}
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll
new file mode 100644
index 00000000000000..138d9876fabda8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "^[ \t]*//.*$" --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-f8f16,+sme-f8f32 -verify-machineinstrs -force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define void @test_fdot16_1x2_indexed(i32 %slice.0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: test_fdot16_1x2_indexed:
+; CHECK: mov w8, w0
+; CHECK: fdot za.h[w8, 7, vgx2], { z0.b, z1.b }, z2.b[1]
+; CHECK: ret
+ %slice = add i32 %slice.0, 7
+ call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 %slice,
+ <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
+ <vscale x 16 x i8> %zm, i32 1)
+ ret void
+}
+
+define void @test_fdot16_1x4_indexed(i32 %slice.0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <v...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you, Jonathan!
We would also want feature and immediate range tests — e.g., immediate argument tests and feature requirement tests.
clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
Outdated
Show resolved
Hide resolved
@SpencerAbson, Now added. |
clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
Outdated
Show resolved
Hide resolved
clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
Outdated
Show resolved
Hide resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A few minor comments, thanks again.
Co-authored-by: SpencerAbson <Spencer.Abson@arm.com>
Thanks, code updated to address these. Much cleaner :) |
Add support for the following SME 8 bit floating-point dot-product intrinsics: