Skip to content

Commit

Permalink
[AArch64] Add intrinsics for SME FP8 FDOT single and multi instructio…
Browse files Browse the repository at this point in the history
…ns (#119845)

Add support for the following SME 8-bit floating-point dot-product intrinsics:

```
// Only if __ARM_FEATURE_SME_F8F16 != 0
void svdot[_single]_za16[_mf8]_vg1x2_fpm(uint32_t slice, svmfloat8x2_t zn,
                                         svmfloat8_t zm,
                                         fpm_t fpm) __arm_streaming __arm_inout("za");

void svdot[_single]_za16[_mf8]_vg1x4_fpm(uint32_t slice, svmfloat8x4_t zn,
                                         svmfloat8_t zm,
                                         fpm_t fpm) __arm_streaming __arm_inout("za");

void svdot_za16[_mf8]_vg1x2_fpm(uint32_t slice, svmfloat8x2_t zn,
                                svmfloat8x2_t zm,
                                fpm_t fpm) __arm_streaming __arm_inout("za");

void svdot_za16[_mf8]_vg1x4_fpm(uint32_t slice, svmfloat8x4_t zn,
                                svmfloat8x4_t zm,
                                fpm_t fpm) __arm_streaming __arm_inout("za");

// Only if __ARM_FEATURE_SME_F8F32 != 0
void svdot[_single]_za32[_mf8]_vg1x2_fpm(uint32_t slice, svmfloat8x2_t zn,
                                         svmfloat8_t zm,
                                         fpm_t fpm) __arm_streaming __arm_inout("za");

void svdot[_single]_za32[_mf8]_vg1x4_fpm(uint32_t slice, svmfloat8x4_t zn,
                                         svmfloat8_t zm,
                                         fpm_t fpm) __arm_streaming __arm_inout("za");

void svdot_za32[_mf8]_vg1x2_fpm(uint32_t slice, svmfloat8x2_t zn,
                                svmfloat8x2_t zm,
                                fpm_t fpm) __arm_streaming __arm_inout("za");

void svdot_za32[_mf8]_vg1x4_fpm(uint32_t slice, svmfloat8x4_t zn,
                                svmfloat8x4_t zm,
                                fpm_t fpm) __arm_streaming __arm_inout("za");
```

These intrinsics are extracted from:
ARM-software/acle#323

Co-authored-by: Momchil Velikov <momchil.velikov@arm.com>
Co-authored-by: Marian Lukac <marian.lukac@arm.com>
  • Loading branch information
3 people authored Dec 16, 2024
1 parent 7d25bce commit ef4b597
Show file tree
Hide file tree
Showing 7 changed files with 415 additions and 16 deletions.
12 changes: 12 additions & 0 deletions clang/include/clang/Basic/arm_sme.td
Original file line number Diff line number Diff line change
Expand Up @@ -748,11 +748,23 @@ let SMETargetGuard = "sme2" in {
// FP8 (mfloat8) dot-product builtins that accumulate into ZA32.
// Every definition in this scope is guarded by the "sme-f8f32" target feature
// and carries the IsStreaming, IsInOutZA, SetsFPMR and IsOverloadNone flags.
let SMETargetGuard = "sme-f8f32" in {
// Lane-indexed forms: the only entries in this scope with an immediate check
// (ImmCheck0_3 applied to argument index 3).
def SVDOT_LANE_FP8_ZA32_VG1x2 : Inst<"svdot_lane_za32[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
def SVDOT_LANE_FP8_ZA32_VG1x4 : Inst<"svdot_lane_za32[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;

// "[_single]" forms: a 2- or 4-vector group multiplied by one vector.
// No immediate operand, hence the empty ImmCheck list.
def SVDOT_SINGLE_FP8_ZA32_VG1x2 : Inst<"svdot[_single]_za32[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
def SVDOT_SINGLE_FP8_ZA32_VG1x4 : Inst<"svdot[_single]_za32[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;

// Multi-vector forms: both multiplicands are vector groups of the same size
// (prototype "22" / "44"). No immediate operand.
def SVDOT_MULTI_FP8_ZA32_VG1x2 : Inst<"svdot_za32[_mf8]_vg1x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
def SVDOT_MULTI_FP8_ZA32_VG1x4 : Inst<"svdot_za32[_mf8]_vg1x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
}

// FP8 (mfloat8) dot-product builtins that accumulate into ZA16.
// Every definition in this scope is guarded by the "sme-f8f16" target feature
// and carries the IsStreaming, IsInOutZA, SetsFPMR and IsOverloadNone flags.
let SMETargetGuard = "sme-f8f16" in {
// Lane-indexed forms: immediate at argument index 3 is checked with
// ImmCheck0_7 (the ZA32 lane forms use ImmCheck0_3 instead).
def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;

// "[_single]" forms: a 2- or 4-vector group multiplied by one vector.
// No immediate operand, hence the empty ImmCheck list.
def SVDOT_SINGLE_FP8_ZA16_VG1x2 : Inst<"svdot[_single]_za16[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
def SVDOT_SINGLE_FP8_ZA16_VG1x4 : Inst<"svdot[_single]_za16[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;

// Multi-vector forms: both multiplicands are vector groups of the same size
// (prototype "22" / "44"). No immediate operand.
def SVDOT_MULTI_FP8_ZA16_VG1x2 : Inst<"svdot_za16[_mf8]_vg1x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
def SVDOT_MULTI_FP8_ZA16_VG1x4 : Inst<"svdot_za16[_mf8]_vg1x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
174 changes: 167 additions & 7 deletions clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fp8_fdot.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,22 @@ void test_features(uint32_t slice, svmfloat8_t f8, svmfloat8x2_t f8x2,
svdot_lane_za16_mf8_vg1x2_fpm(slice, f8x2, f8, 3, fpmr);
// expected-error@+1 {{'svdot_lane_za16_mf8_vg1x4_fpm' needs target feature sme,sme-f8f16}}
svdot_lane_za16_mf8_vg1x4_fpm(slice, f8x4, f8, 3, fpmr);
// expected-error@+1 {{'svdot_single_za32_mf8_vg1x2_fpm' needs target feature sme,sme-f8f32}}
svdot_single_za32_mf8_vg1x2_fpm(slice, f8x2, f8, fpmr);
// expected-error@+1 {{'svdot_single_za32_mf8_vg1x4_fpm' needs target feature sme,sme-f8f32}}
svdot_single_za32_mf8_vg1x4_fpm(slice, f8x4, f8, fpmr);
// expected-error@+1 {{'svdot_za32_mf8_vg1x2_fpm' needs target feature sme,sme-f8f32}}
svdot_za32_mf8_vg1x2_fpm(slice, f8x2, f8x2, fpmr);
// expected-error@+1 {{'svdot_za32_mf8_vg1x4_fpm' needs target feature sme,sme-f8f32}}
svdot_za32_mf8_vg1x4_fpm(slice, f8x4, f8x4, fpmr);
// expected-error@+1 {{'svdot_single_za16_mf8_vg1x2_fpm' needs target feature sme,sme-f8f16}}
svdot_single_za16_mf8_vg1x2_fpm(slice, f8x2, f8, fpmr);
// expected-error@+1 {{'svdot_single_za16_mf8_vg1x4_fpm' needs target feature sme,sme-f8f16}}
svdot_single_za16_mf8_vg1x4_fpm(slice, f8x4, f8, fpmr);
// expected-error@+1 {{'svdot_za16_mf8_vg1x2_fpm' needs target feature sme,sme-f8f16}}
svdot_za16_mf8_vg1x2_fpm(slice, f8x2, f8x2, fpmr);
// expected-error@+1 {{'svdot_za16_mf8_vg1x4_fpm' needs target feature sme,sme-f8f16}}
svdot_za16_mf8_vg1x4_fpm(slice, f8x4, f8x4, fpmr);
}

void test_imm(uint32_t slice, svmfloat8_t f8, svmfloat8x2_t f8x2,
Expand Down
36 changes: 36 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -3874,11 +3874,47 @@ class SME2_FP8_FDOT_LANE_VG1x4 :
llvm_i32_ty],
[IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;

// Signature shared by the FP8 FDOT "single" intrinsics on a 2-vector group.
// No results. Operands: one i32, then three nxv16i8 vectors
// (NOTE(review): presumably slice index, the two Zn vectors, then the single
// Zm vector - confirm against the callers).
// Marked IntrInaccessibleMemOnly and IntrHasSideEffects; no ImmArg operands.
class SME2_FP8_FDOT_SINGLE_VG1x2 :
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty],
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;

// Signature shared by the FP8 FDOT "single" intrinsics on a 4-vector group.
// No results. Operands: one i32, then five nxv16i8 vectors
// (NOTE(review): presumably slice index, the four Zn vectors, then the single
// Zm vector - confirm against the callers).
// Marked IntrInaccessibleMemOnly and IntrHasSideEffects; no ImmArg operands.
class SME2_FP8_FDOT_SINGLE_VG1x4 :
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty],
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;

// Signature shared by the FP8 FDOT "multi" intrinsics on 2-vector groups.
// No results. Operands: one i32, then four nxv16i8 vectors
// (NOTE(review): presumably slice index, two Zn vectors, two Zm vectors -
// confirm against the callers).
// Marked IntrInaccessibleMemOnly and IntrHasSideEffects; no ImmArg operands.
class SME2_FP8_FDOT_MULTI_VG1x2 :
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;

// Signature shared by the FP8 FDOT "multi" intrinsics on 4-vector groups.
// No results. Operands: one i32, then eight nxv16i8 vectors
// (NOTE(review): presumably slice index, four Zn vectors, four Zm vectors -
// confirm against the callers).
// Marked IntrInaccessibleMemOnly and IntrHasSideEffects; no ImmArg operands.
class SME2_FP8_FDOT_MULTI_VG1x4 :
DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;

// FP8 FDOT intrinsic definitions. Each record only selects a signature class;
// the za16 and za32 variants of a given form share the same signature.

// Lane-indexed forms.
def int_aarch64_sme_fp8_fdot_lane_za16_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;
def int_aarch64_sme_fp8_fdot_lane_za16_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4;

def int_aarch64_sme_fp8_fdot_lane_za32_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;
def int_aarch64_sme_fp8_fdot_lane_za32_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4;

// "Single" forms: vector group multiplied by one vector.
def int_aarch64_sme_fp8_fdot_single_za16_vg1x2 : SME2_FP8_FDOT_SINGLE_VG1x2;
def int_aarch64_sme_fp8_fdot_single_za16_vg1x4 : SME2_FP8_FDOT_SINGLE_VG1x4;

def int_aarch64_sme_fp8_fdot_single_za32_vg1x2 : SME2_FP8_FDOT_SINGLE_VG1x2;
def int_aarch64_sme_fp8_fdot_single_za32_vg1x4 : SME2_FP8_FDOT_SINGLE_VG1x4;

// "Multi" forms: vector group multiplied by a vector group of the same size.
def int_aarch64_sme_fp8_fdot_multi_za16_vg1x2 : SME2_FP8_FDOT_MULTI_VG1x2;
def int_aarch64_sme_fp8_fdot_multi_za16_vg1x4 : SME2_FP8_FDOT_MULTI_VG1x4;

def int_aarch64_sme_fp8_fdot_multi_za32_vg1x2 : SME2_FP8_FDOT_MULTI_VG1x2;
def int_aarch64_sme_fp8_fdot_multi_za32_vg1x4 : SME2_FP8_FDOT_MULTI_VG1x4;
}

//
Expand Down
17 changes: 8 additions & 9 deletions llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -988,11 +988,11 @@ let Predicates = [HasSMEF8F16] in {
defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
defm FDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za16_vg1x2>;
defm FDOT_VG4_M4ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x4<"fdot", int_aarch64_sme_fp8_fdot_lane_za16_vg1x4>;
defm FDOT_VG2_M2ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
defm FDOT_VG4_M4ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;

defm FDOT_VG2_M2Z2Z_BtoH : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b0100100, MatrixOp16, ZZ_b_mul_r, nxv16i8, null_frag>;
defm FDOT_VG4_M4Z4Z_BtoH : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b0100100, MatrixOp16, ZZZZ_b_mul_r, nxv16i8, null_frag>;
defm FDOT_VG2_M2ZZ_BtoH : sme2_fp8_fdot_single_vg1x2<"fdot", 0b0010001, MatrixOp16, int_aarch64_sme_fp8_fdot_single_za16_vg1x2>;
defm FDOT_VG4_M4ZZ_BtoH : sme2_fp8_fdot_single_vg1x4<"fdot", 0b0110001, MatrixOp16, int_aarch64_sme_fp8_fdot_single_za16_vg1x4>;
defm FDOT_VG2_M2Z2Z_BtoH : sme2_fp8_fdot_multi_vg1x2 <"fdot", 0b0100100, MatrixOp16, int_aarch64_sme_fp8_fdot_multi_za16_vg1x2>;
defm FDOT_VG4_M4Z4Z_BtoH : sme2_fp8_fdot_multi_vg1x4 <"fdot", 0b0100100, MatrixOp16, int_aarch64_sme_fp8_fdot_multi_za16_vg1x4>;

def FMLAL_MZZI_BtoH : sme2_mla_ll_array_index_16b<"fmlal", 0b11, 0b00>;
defm FMLAL_VG2_M2ZZI_BtoH : sme2_multi_vec_array_vg2_index_16b<"fmlal", 0b10, 0b111>;
Expand All @@ -1011,11 +1011,10 @@ let Predicates = [HasSMEF8F32] in {
defm FDOT_VG2_M2ZZI_BtoS : sme2_fp8_fdot_index_za32_vg1x2<"fdot", int_aarch64_sme_fp8_fdot_lane_za32_vg1x2>;
defm FDOT_VG4_M4ZZI_BtoS : sme2_fp8_fdot_index_za32_vg1x4<"fdot", int_aarch64_sme_fp8_fdot_lane_za32_vg1x4>;

defm FDOT_VG2_M2ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010011, MatrixOp32, ZZ_b, ZPR4b8>;
defm FDOT_VG4_M4ZZ_BtoS : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110011, MatrixOp32, ZZZZ_b, ZPR4b8>;

defm FDOT_VG2_M2Z2Z_BtoS : sme2_dot_mla_add_sub_array_vg2_multi<"fdot", 0b0100110, MatrixOp32, ZZ_b_mul_r, nxv16i8, null_frag>;
defm FDOT_VG4_M4Z4Z_BtoS : sme2_dot_mla_add_sub_array_vg4_multi<"fdot", 0b0100110, MatrixOp32, ZZZZ_b_mul_r, nxv16i8, null_frag>;
defm FDOT_VG2_M2ZZ_BtoS : sme2_fp8_fdot_single_vg1x2<"fdot", 0b0010011, MatrixOp32, int_aarch64_sme_fp8_fdot_single_za32_vg1x2>;
defm FDOT_VG4_M4ZZ_BtoS : sme2_fp8_fdot_single_vg1x4<"fdot", 0b0110011, MatrixOp32, int_aarch64_sme_fp8_fdot_single_za32_vg1x4>;
defm FDOT_VG2_M2Z2Z_BtoS : sme2_fp8_fdot_multi_vg1x2 <"fdot", 0b0100110, MatrixOp32, int_aarch64_sme_fp8_fdot_multi_za32_vg1x2>;
defm FDOT_VG4_M4Z4Z_BtoS : sme2_fp8_fdot_multi_vg1x4 <"fdot", 0b0100110, MatrixOp32, int_aarch64_sme_fp8_fdot_multi_za32_vg1x4>;

def FVDOTB_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdotb", 0b0>;
def FVDOTT_VG4_M2ZZI_BtoS : sme2_fp8_multi_vec_array_vg4_index<"fvdott", 0b1>;
Expand Down
64 changes: 64 additions & 0 deletions llvm/lib/Target/AArch64/SMEInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -5882,3 +5882,67 @@ multiclass sme2_fp8_fdot_index_za32_vg1x4<string mnemonic,

def : SME2_ZA_TwoOp_VG4_Multi_Index_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, VectorIndexS32b_timm, tileslice16>;
}

// FP8 FDOT, 2-vector group (ZZ_b) by single vector (ZPR4b8).
// Emits the real instruction, an assembly alias, a pseudo and an intrinsic
// selection pattern.
//   mnemonic  - assembly mnemonic.
//   op        - 7-bit opcode field forwarded to the instruction class.
//   matrix_op - ZA matrix operand class (callers pass MatrixOp16 or MatrixOp32).
//   intrinsic - intrinsic matched by the selection pattern.
multiclass sme2_fp8_fdot_single_vg1x2<string mnemonic, bits<7> op,
MatrixOperand matrix_op,
SDPatternOperator intrinsic> {
def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_op, ZZ_b, ZPR4b8, mnemonic>,
SMEPseudo2Instr<NAME, 1> {
// The instruction implicitly reads the FPMR and FPCR registers.
let Uses=[FPMR, FPCR];
}

// Alias with the operand syntax "$ZAd[$Rv, $imm3], $Zn, $Zm".
// NOTE(review): the trailing 0 is presumably the InstAlias emit priority
// (accepted when parsing, not used for printing) - confirm.
def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
(!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZ_b:$Zn, ZPR4b8:$Zm), 0>;

// Pseudo counterpart of NAME, with a 0-7 slice offset immediate.
def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, sme_elm_idx0_7, ZZ_b, ZPR4b8, SMEMatrixArray>;

// Selection pattern for `intrinsic`. The pattern class is defined elsewhere;
// presumably it selects the _PSEUDO above - confirm.
def : SME2_ZA_TwoOp_VG2_Multi_Single_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, tileslice16>;
}

// FP8 FDOT, 4-vector group (ZZZZ_b) by single vector (ZPR4b8).
// Emits the real instruction, an assembly alias, a pseudo and an intrinsic
// selection pattern.
//   mnemonic  - assembly mnemonic.
//   op        - 7-bit opcode field forwarded to the instruction class.
//   matrix_op - ZA matrix operand class (callers pass MatrixOp16 or MatrixOp32).
//   intrinsic - intrinsic matched by the selection pattern.
multiclass sme2_fp8_fdot_single_vg1x4<string mnemonic, bits<7> op,
MatrixOperand matrix_op,
SDPatternOperator intrinsic> {
def NAME: sme2_dot_mla_add_sub_array_vg24_single<op, matrix_op, ZZZZ_b, ZPR4b8, mnemonic>,
SMEPseudo2Instr<NAME, 1> {
// The instruction implicitly reads the FPMR and FPCR registers.
let Uses=[FPMR, FPCR];
}

// Alias with the operand syntax "$ZAd[$Rv, $imm3], $Zn, $Zm".
// NOTE(review): the trailing 0 is presumably the InstAlias emit priority
// (accepted when parsing, not used for printing) - confirm.
def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
(!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZZZ_b:$Zn, ZPR4b8:$Zm), 0>;

// Pseudo counterpart of NAME, with a 0-7 slice offset immediate.
def _PSEUDO : sme2_za_array_2op_multi_single_pseudo<NAME, sme_elm_idx0_7, ZZZZ_b, ZPR4b8, SMEMatrixArray>;

// Selection pattern for `intrinsic`. The pattern class is defined elsewhere;
// presumably it selects the _PSEUDO above - confirm.
def : SME2_ZA_TwoOp_VG4_Multi_Single_Pat<NAME, intrinsic, sme_elm_idx0_7, ZPR4b8, nxv16i8, tileslice16>;
}

// FP8 FDOT, 2-vector group by 2-vector group (both operands ZZ_b_mul_r).
// Emits the real instruction, an assembly alias, a pseudo and an intrinsic
// selection pattern.
//   mnemonic  - assembly mnemonic.
//   op        - 7-bit opcode field forwarded to the instruction class.
//   matrix_op - ZA matrix operand class (callers pass MatrixOp16 or MatrixOp32).
//   intrinsic - intrinsic matched by the selection pattern.
multiclass sme2_fp8_fdot_multi_vg1x2<string mnemonic, bits<7> op,
MatrixOperand matrix_op,
SDPatternOperator intrinsic> {
def NAME : sme2_dot_mla_add_sub_array_vg2_multi<op, matrix_op, ZZ_b_mul_r, mnemonic>,
SMEPseudo2Instr<NAME, 1> {
// The instruction implicitly reads the FPMR and FPCR registers.
let Uses=[FPMR, FPCR];
}

// Alias with the operand syntax "$ZAd[$Rv, $imm3], $Zn, $Zm".
// NOTE(review): the trailing 0 is presumably the InstAlias emit priority
// (accepted when parsing, not used for printing) - confirm.
def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
(!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZ_b_mul_r:$Zn, ZZ_b_mul_r:$Zm), 0>;

// Pseudo counterpart of NAME, with a 0-7 slice offset immediate.
def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME, sme_elm_idx0_7, ZZ_b_mul_r, SMEMatrixArray>;

// Selection pattern for `intrinsic`. The pattern class is defined elsewhere;
// presumably it selects the _PSEUDO above - confirm.
def : SME2_ZA_TwoOp_VG2_Multi_Multi_Pat<NAME, intrinsic, sme_elm_idx0_7, nxv16i8, tileslice16>;
}

// FP8 FDOT, 4-vector group by 4-vector group (both operands ZZZZ_b_mul_r).
// Emits the real instruction, an assembly alias, a pseudo and an intrinsic
// selection pattern.
//   mnemonic  - assembly mnemonic.
//   op        - 7-bit opcode field forwarded to the instruction class.
//   matrix_op - ZA matrix operand class (callers pass MatrixOp16 or MatrixOp32).
//   intrinsic - intrinsic matched by the selection pattern.
multiclass sme2_fp8_fdot_multi_vg1x4<string mnemonic, bits<7> op,
MatrixOperand matrix_op,
SDPatternOperator intrinsic> {
def NAME : sme2_dot_mla_add_sub_array_vg4_multi<op, matrix_op, ZZZZ_b_mul_r, mnemonic>,
SMEPseudo2Instr<NAME, 1> {
// The instruction implicitly reads the FPMR and FPCR registers.
let Uses=[FPMR, FPCR];
}

// Alias with the operand syntax "$ZAd[$Rv, $imm3], $Zn, $Zm".
// NOTE(review): the trailing 0 is presumably the InstAlias emit priority
// (accepted when parsing, not used for printing) - confirm.
def : InstAlias<mnemonic # "\t$ZAd[$Rv, $imm3], $Zn, $Zm",
(!cast<Instruction>(NAME) matrix_op:$ZAd, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, ZZZZ_b_mul_r:$Zn, ZZZZ_b_mul_r:$Zm), 0>;

// Pseudo counterpart of NAME, with a 0-7 slice offset immediate.
def _PSEUDO : sme2_za_array_2op_multi_multi_pseudo<NAME, sme_elm_idx0_7, ZZZZ_b_mul_r, SMEMatrixArray>;

// Selection pattern for `intrinsic`. The pattern class is defined elsewhere;
// presumably it selects the _PSEUDO above - confirm.
def : SME2_ZA_TwoOp_VG4_Multi_Multi_Pat<NAME, intrinsic, sme_elm_idx0_7, nxv16i8, tileslice16>;
}
112 changes: 112 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,116 @@ define void @test_fdot32_1x4_indexed(i32 %slice.0,
ret void
}

; ZA32, 2-vector group by single vector: the intrinsic must select
; "fdot za.s[..., vgx2]" and the "+7" added to the slice must be folded into
; the instruction's immediate offset.
define void @test_fdot32_1x2_single(i32 %slice.0,
; CHECK-LABEL: test_fdot32_1x2_single:
; CHECK: mov w8, w0
; CHECK: fdot za.s[w8, 7, vgx2], { z0.b, z1.b }, z2.b
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x2(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm)
ret void
}

; ZA32, 4-vector group by single vector: the intrinsic must select
; "fdot za.s[..., vgx4]" and the "+7" added to the slice must be folded into
; the instruction's immediate offset.
define void @test_fdot32_1x4_single(i32 %slice.0,
; CHECK-LABEL: test_fdot32_1x4_single:
; CHECK: mov w8, w0
; CHECK: fdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
<vscale x 16 x i8> %zm) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x4(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
<vscale x 16 x i8> %zm)
ret void
}

; ZA32, 2-vector group by 2-vector group: the intrinsic must select
; "fdot za.s[..., vgx2]" with two register lists, and the "+7" added to the
; slice must be folded into the instruction's immediate offset.
define void @test_fdot32_1x2_multi(i32 %slice.0,
; CHECK-LABEL: test_fdot32_1x2_multi:
; CHECK: mov w8, w0
; CHECK: fdot za.s[w8, 7, vgx2], { z0.b, z1.b }, { z2.b, z3.b }
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x2(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
ret void
}

; ZA32, 4-vector group by 4-vector group: the intrinsic must select
; "fdot za.s[..., vgx4]" with two register lists, and the "+7" added to the
; slice must be folded into the instruction's immediate offset.
define void @test_fdot32_1x4_multi(i32 %slice.0,
; CHECK-LABEL: test_fdot32_1x4_multi:
; CHECK: mov w8, w0
; CHECK: fdot za.s[w8, 7, vgx4], { z0.b - z3.b }, { z4.b - z7.b }
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x4(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4)
ret void
}

; ZA16, 2-vector group by single vector: the intrinsic must select
; "fdot za.h[..., vgx2]" and the "+7" added to the slice must be folded into
; the instruction's immediate offset.
define void @test_fdot16_1x2_single(i32 %slice.0,
; CHECK-LABEL: test_fdot16_1x2_single:
; CHECK: mov w8, w0
; CHECK: fdot za.h[w8, 7, vgx2], { z0.b, z1.b }, z2.b
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x2(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm)
ret void
}

; ZA16, 4-vector group by single vector: the intrinsic must select
; "fdot za.h[..., vgx4]" and the "+7" added to the slice must be folded into
; the instruction's immediate offset.
define void @test_fdot16_1x4_single(i32 %slice.0,
; CHECK-LABEL: test_fdot16_1x4_single:
; CHECK: mov w8, w0
; CHECK: fdot za.h[w8, 7, vgx4], { z0.b - z3.b }, z4.b
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
<vscale x 16 x i8> %zm) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x4(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
<vscale x 16 x i8> %zm)
ret void
}

; ZA16, 2-vector group by 2-vector group: the intrinsic must select
; "fdot za.h[..., vgx2]" with two register lists, and the "+7" added to the
; slice must be folded into the instruction's immediate offset.
define void @test_fdot16_1x2_multi(i32 %slice.0,
; CHECK-LABEL: test_fdot16_1x2_multi:
; CHECK: mov w8, w0
; CHECK: fdot za.h[w8, 7, vgx2], { z0.b, z1.b }, { z2.b, z3.b }
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x2(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
ret void
}

; ZA16, 4-vector group by 4-vector group: the intrinsic must select
; "fdot za.h[..., vgx4]" with two register lists, and the "+7" added to the
; slice must be folded into the instruction's immediate offset.
define void @test_fdot16_1x4_multi(i32 %slice.0,
; CHECK-LABEL: test_fdot16_1x4_multi:
; CHECK: mov w8, w0
; CHECK: fdot za.h[w8, 7, vgx4], { z0.b - z3.b }, { z4.b - z7.b }
; CHECK: ret
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) #0 {
%slice = add i32 %slice.0, 7
call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x4(i32 %slice,
<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
<vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4)
ret void
}

attributes #0 = { "target-features" = "+sme,+sme-f8f32,+sme-f8f16" }

0 comments on commit ef4b597

Please sign in to comment.