From d96a14a9284afd8722e5ad2840f97949f874e00b Mon Sep 17 00:00:00 2001
From: Richard Sandiford
Date: Tue, 17 Dec 2024 10:49:55 +0000
Subject: [PATCH] Add mf8 forms of vbsl, vluti2 and vluti4

vbsl has floating-point forms because the intrinsic is useful for
blending two different sources.  This patch adds mf8 forms on the
same grounds.

The FEAT_LUT support was likewise developed in parallel with the FP8
support, so its 8-bit forms make sense for mf8 too.

(Usage sketches for the new intrinsics follow the patch.)
---
 neon_intrinsics/advsimd.md | 64 +++++++++++--------
 tools/intrinsic_db/advsimd.csv | 10 +++
 tools/intrinsic_db/advsimd_classification.csv | 8 +++
 3 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/neon_intrinsics/advsimd.md b/neon_intrinsics/advsimd.md
index 5506503f..efb9a03a 100644
--- a/neon_intrinsics/advsimd.md
+++ b/neon_intrinsics/advsimd.md
@@ -3070,34 +3070,36 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``.

#### Bitwise select

-| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures |
-|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|----------------------------|--------------------|---------------------------|
-| int8x8_t vbsl_s8(<br>
     uint8x8_t a,
     int8x8_t b,
     int8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| int8x16_t vbslq_s8(
     uint8x16_t a,
     int8x16_t b,
     int8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| int16x4_t vbsl_s16(
     uint16x4_t a,
     int16x4_t b,
     int16x4_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| int16x8_t vbslq_s16(
     uint16x8_t a,
     int16x8_t b,
     int16x8_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| int32x2_t vbsl_s32(
     uint32x2_t a,
     int32x2_t b,
     int32x2_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| int32x4_t vbslq_s32(
     uint32x4_t a,
     int32x4_t b,
     int32x4_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| int64x1_t vbsl_s64(
     uint64x1_t a,
     int64x1_t b,
     int64x1_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| int64x2_t vbslq_s64(
     uint64x2_t a,
     int64x2_t b,
     int64x2_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| uint8x8_t vbsl_u8(
     uint8x8_t a,
     uint8x8_t b,
     uint8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| uint8x16_t vbslq_u8(
     uint8x16_t a,
     uint8x16_t b,
     uint8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| uint16x4_t vbsl_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| uint16x8_t vbslq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| uint32x2_t vbsl_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| uint32x4_t vbslq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| uint64x1_t vbsl_u64(
     uint64x1_t a,
     uint64x1_t b,
     uint64x1_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| uint64x2_t vbslq_u64(
     uint64x2_t a,
     uint64x2_t b,
     uint64x2_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| poly64x1_t vbsl_p64(
     poly64x1_t a,
     poly64x1_t b,
     poly64x1_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `A32/A64` | -| poly64x2_t vbslq_p64(
     poly64x2_t a,
     poly64x2_t b,
     poly64x2_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `A32/A64` | -| float32x2_t vbsl_f32(
     uint32x2_t a,
     float32x2_t b,
     float32x2_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| float32x4_t vbslq_f32(
     uint32x4_t a,
     float32x4_t b,
     float32x4_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| poly8x8_t vbsl_p8(
     uint8x8_t a,
     poly8x8_t b,
     poly8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| poly8x16_t vbslq_p8(
     uint8x16_t a,
     poly8x16_t b,
     poly8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| poly16x4_t vbsl_p16(
     uint16x4_t a,
     poly16x4_t b,
     poly16x4_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | -| poly16x8_t vbslq_p16(
     uint16x8_t a,
     poly16x8_t b,
     poly16x8_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | -| float64x1_t vbsl_f64(
     uint64x1_t a,
     float64x1_t b,
     float64x1_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `A64` | -| float64x2_t vbslq_f64(
     uint64x2_t a,
     float64x2_t b,
     float64x2_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `A64` | +| Intrinsic | Argument preparation | AArch64 Instruction | Result | Supported architectures | +|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------|----------------------------|--------------------|---------------------------| +| int8x8_t vbsl_s8(
     uint8x8_t a,
     int8x8_t b,
     int8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| int8x16_t vbslq_s8(
     uint8x16_t a,
     int8x16_t b,
     int8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| int16x4_t vbsl_s16(
     uint16x4_t a,
     int16x4_t b,
     int16x4_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| int16x8_t vbslq_s16(
     uint16x8_t a,
     int16x8_t b,
     int16x8_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| int32x2_t vbsl_s32(
     uint32x2_t a,
     int32x2_t b,
     int32x2_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| int32x4_t vbslq_s32(
     uint32x4_t a,
     int32x4_t b,
     int32x4_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| int64x1_t vbsl_s64(
     uint64x1_t a,
     int64x1_t b,
     int64x1_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| int64x2_t vbslq_s64(
     uint64x2_t a,
     int64x2_t b,
     int64x2_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| uint8x8_t vbsl_u8(
     uint8x8_t a,
     uint8x8_t b,
     uint8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| uint8x16_t vbslq_u8(
     uint8x16_t a,
     uint8x16_t b,
     uint8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| uint16x4_t vbsl_u16(
     uint16x4_t a,
     uint16x4_t b,
     uint16x4_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| uint16x8_t vbslq_u16(
     uint16x8_t a,
     uint16x8_t b,
     uint16x8_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| uint32x2_t vbsl_u32(
     uint32x2_t a,
     uint32x2_t b,
     uint32x2_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| uint32x4_t vbslq_u32(
     uint32x4_t a,
     uint32x4_t b,
     uint32x4_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| uint64x1_t vbsl_u64(
     uint64x1_t a,
     uint64x1_t b,
     uint64x1_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| uint64x2_t vbslq_u64(
     uint64x2_t a,
     uint64x2_t b,
     uint64x2_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| poly64x1_t vbsl_p64(
     poly64x1_t a,
     poly64x1_t b,
     poly64x1_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `A32/A64` | +| poly64x2_t vbslq_p64(
     poly64x2_t a,
     poly64x2_t b,
     poly64x2_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `A32/A64` | +| float32x2_t vbsl_f32(
     uint32x2_t a,
     float32x2_t b,
     float32x2_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| float32x4_t vbslq_f32(
     uint32x4_t a,
     float32x4_t b,
     float32x4_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| poly8x8_t vbsl_p8(
     uint8x8_t a,
     poly8x8_t b,
     poly8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| poly8x16_t vbslq_p8(
     uint8x16_t a,
     poly8x16_t b,
     poly8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| poly16x4_t vbsl_p16(
     uint16x4_t a,
     poly16x4_t b,
     poly16x4_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `v7/A32/A64` | +| poly16x8_t vbslq_p16(
     uint16x8_t a,
     poly16x8_t b,
     poly16x8_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `v7/A32/A64` | +| float64x1_t vbsl_f64(
     uint64x1_t a,
     float64x1_t b,
     float64x1_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `A64` | +| float64x2_t vbslq_f64(
     uint64x2_t a,
     float64x2_t b,
     float64x2_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `A64` | +| mfloat8x8_t vbsl_mf8(
     uint8x8_t a,
     mfloat8x8_t b,
     mfloat8x8_t c)
| `a -> Vd.8B`
`b -> Vn.8B`
`c -> Vm.8B` | `BSL Vd.8B,Vn.8B,Vm.8B` | `Vd.8B -> result` | `A64` |
+| mfloat8x16_t vbslq_mf8(<br>
     uint8x16_t a,
     mfloat8x16_t b,
     mfloat8x16_t c)
| `a -> Vd.16B`
`b -> Vn.16B`
`c -> Vm.16B` | `BSL Vd.16B,Vn.16B,Vm.16B` | `Vd.16B -> result` | `A64` |

### Vector manipulation

@@ -4707,6 +4709,10 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``.

| poly8x16_t vluti2_laneq_p8(<br>
     poly8x8_t vn,
     uint8x16_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 3` | `LUTI2 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | | poly8x16_t vluti2q_lane_p8(
     poly8x16_t vn,
     uint8x8_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 1` | `LUTI2 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | | poly8x16_t vluti2q_laneq_p8(
     poly8x16_t vn,
     uint8x16_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 3` | `LUTI2 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | +| mfloat8x16_t vluti2_lane_mf8(
     mfloat8x8_t vn,
     uint8x8_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 1` | `LUTI2 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | +| mfloat8x16_t vluti2_laneq_mf8(
     mfloat8x8_t vn,
     uint8x16_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 3` | `LUTI2 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | +| mfloat8x16_t vluti2q_lane_mf8(
     mfloat8x16_t vn,
     uint8x8_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 1` | `LUTI2 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | +| mfloat8x16_t vluti2q_laneq_mf8(
     mfloat8x16_t vn,
     uint8x16_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 3` | `LUTI2 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | | uint16x8_t vluti2_lane_u16(
     uint16x4_t vn,
     uint8x8_t vm,
     const int index)
| `vn -> Vn.8H`
`vm -> Vm`
`0 <= index <= 3` | `LUTI2 Vd.8H, {Vn.8H}, Vm[index]` | `Vd.8H -> result` | `A64` | | uint16x8_t vluti2_laneq_u16(
     uint16x4_t vn,
     uint8x16_t vm,
     const int index)
| `vn -> Vn.8H`
`vm -> Vm`
`0 <= index <= 7` | `LUTI2 Vd.8H, {Vn.8H}, Vm[index]` | `Vd.8H -> result` | `A64` | | uint16x8_t vluti2q_lane_u16(
     uint16x8_t vn,
     uint8x8_t vm,
     const int index)
| `vn -> Vn.8H`
`vm -> Vm`
`0 <= index <= 3` | `LUTI2 Vd.8H, {Vn.8H}, Vm[index]` | `Vd.8H -> result` | `A64` | @@ -4738,6 +4744,8 @@ The intrinsics in this section are guarded by the macro ``__ARM_NEON``. | int8x16_t vluti4q_laneq_s8(
     int8x16_t vn,
     uint8x16_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 1` | `LUTI4 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | | poly8x16_t vluti4q_lane_p8(
     poly8x16_t vn,
     uint8x8_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 0` | `LUTI4 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | | poly8x16_t vluti4q_laneq_p8(
     poly8x16_t vn,
     uint8x16_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 1` | `LUTI4 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | +| mfloat8x16_t vluti4q_lane_mf8(
     mfloat8x16_t vn,
     uint8x8_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 0` | `LUTI4 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | +| mfloat8x16_t vluti4q_laneq_mf8(
     mfloat8x16_t vn,
     uint8x16_t vm,
     const int index)
| `vn -> Vn.16B`
`vm -> Vm`
`0 <= index <= 1` | `LUTI4 Vd.16B, {Vn.16B}, Vm[index]` | `Vd.16B -> result` | `A64` | | uint16x8_t vluti4q_lane_u16_x2(
     uint16x8x2_t vn,
     uint8x8_t vm,
     const int index)
| `vn.val[0] -> Vn1.8H`
`vn.val[1] -> Vn2.8H`
`vm -> Vm`
`0 <= index <= 1` | `LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]` | `Vd.8H -> result` | `A64` | | uint16x8_t vluti4q_laneq_u16_x2(
     uint16x8x2_t vn,
     uint8x16_t vm,
     const int index)
| `vn.val[0] -> Vn1.8H`
`vn.val[1] -> Vn2.8H`
`vm -> Vm`
`0 <= index <= 3` | `LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]` | `Vd.8H -> result` | `A64` | | int16x8_t vluti4q_lane_s16_x2(
     int16x8x2_t vn,
     uint8x8_t vm,
     const int index)
| `vn.val[0] -> Vn1.8H`
`vn.val[1] -> Vn2.8H`
`vm -> Vm`
`0 <= index <= 1` | `LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index]` | `Vd.8H -> result` | `A64` |

diff --git a/tools/intrinsic_db/advsimd.csv b/tools/intrinsic_db/advsimd.csv
index ec88903a..a800d90d 100644
--- a/tools/intrinsic_db/advsimd.csv
+++ b/tools/intrinsic_db/advsimd.csv
@@ -1818,6 +1818,8 @@ poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) a -> Vd.8B;b -> Vn
poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) a -> Vd.16B;b -> Vn.16B;c -> Vm.16B BSL Vd.16B,Vn.16B,Vm.16B Vd.16B -> result v7/A32/A64
float64x1_t vbsl_f64(uint64x1_t a, float64x1_t b, float64x1_t c) a -> Vd.8B;b -> Vn.8B;c -> Vm.8B BSL Vd.8B,Vn.8B,Vm.8B Vd.8B -> result A64
float64x2_t vbslq_f64(uint64x2_t a, float64x2_t b, float64x2_t c) a -> Vd.16B;b -> Vn.16B;c -> Vm.16B BSL Vd.16B,Vn.16B,Vm.16B Vd.16B -> result A64
+mfloat8x8_t vbsl_mf8(uint8x8_t a, mfloat8x8_t b, mfloat8x8_t c) a -> Vd.8B;b -> Vn.8B;c -> Vm.8B BSL Vd.8B,Vn.8B,Vm.8B Vd.8B -> result A64
+mfloat8x16_t vbslq_mf8(uint8x16_t a, mfloat8x16_t b, mfloat8x16_t c) a -> Vd.16B;b -> Vn.16B;c -> Vm.16B BSL Vd.16B,Vn.16B,Vm.16B Vd.16B -> result A64
int8x8_t vcopy_lane_s8(int8x8_t a, __builtin_constant_p(lane1), int8x8_t b, __builtin_constant_p(lane2)) a -> Vd.8B;0 <= lane1 <= 7;b -> Vn.8B;0 <= lane2 <= 7 INS Vd.B[lane1],Vn.B[lane2] Vd.8B -> result A64
int8x16_t vcopyq_lane_s8(int8x16_t a, __builtin_constant_p(lane1), int8x8_t b, __builtin_constant_p(lane2)) a -> Vd.16B;0 <= lane1 <= 15;b -> Vn.8B;0 <= lane2 <= 7 INS Vd.B[lane1],Vn.B[lane2] Vd.16B -> result A64
int16x4_t vcopy_lane_s16(int16x4_t a, __builtin_constant_p(lane1), int16x4_t b, __builtin_constant_p(lane2)) a -> Vd.4H;0 <= lane1 <= 3;b -> Vn.4H;0 <= lane2 <= 3 INS Vd.H[lane1],Vn.H[lane2] Vd.4H -> result A64
@@ -3926,6 +3928,11 @@ poly8x16_t vluti2_laneq_p8(poly8x8_t vn, uint8x16_t vm, __builtin_constant_p(ind
poly8x16_t vluti2q_lane_p8(poly8x16_t vn, uint8x8_t vm, __builtin_constant_p(index)) vn -> Vn.16B;vm -> Vm;0 <= index <= 1 LUTI2 Vd.16B, {Vn.16B}, Vm[index] Vd.16B -> result A64
poly8x16_t vluti2q_laneq_p8(poly8x16_t vn, uint8x16_t vm, __builtin_constant_p(index)) vn -> Vn.16B;vm -> Vm;0 <= index <= 3 LUTI2 Vd.16B, {Vn.16B}, Vm[index] Vd.16B -> result A64
+mfloat8x16_t vluti2_lane_mf8(mfloat8x8_t vn, uint8x8_t vm, __builtin_constant_p(index)) vn -> Vn.16B;vm -> Vm;0 <= index <= 1 LUTI2 Vd.16B, {Vn.16B}, Vm[index] Vd.16B -> result A64
+mfloat8x16_t vluti2_laneq_mf8(mfloat8x8_t vn, uint8x16_t vm, __builtin_constant_p(index)) vn -> Vn.16B;vm -> Vm;0 <= index <= 3 LUTI2 Vd.16B, {Vn.16B}, Vm[index] Vd.16B -> result A64
+mfloat8x16_t vluti2q_lane_mf8(mfloat8x16_t vn, uint8x8_t vm, __builtin_constant_p(index)) vn -> Vn.16B;vm -> Vm;0 <= index <= 1 LUTI2 Vd.16B, {Vn.16B}, Vm[index] Vd.16B -> result A64
+mfloat8x16_t vluti2q_laneq_mf8(mfloat8x16_t vn, uint8x16_t vm, __builtin_constant_p(index)) vn -> Vn.16B;vm -> Vm;0 <= index <= 3 LUTI2 Vd.16B, {Vn.16B}, Vm[index] Vd.16B -> result A64
+
uint16x8_t vluti2_lane_u16(uint16x4_t vn, uint8x8_t vm, __builtin_constant_p(index)) vn -> Vn.8H;vm -> Vm;0 <= index <= 3 LUTI2 Vd.8H, {Vn.8H}, Vm[index] Vd.8H -> result A64
uint16x8_t vluti2_laneq_u16(uint16x4_t vn, uint8x16_t vm, __builtin_constant_p(index)) vn -> Vn.8H;vm -> Vm;0 <= index <= 7 LUTI2 Vd.8H, {Vn.8H}, Vm[index] Vd.8H -> result A64
uint16x8_t vluti2q_lane_u16(uint16x8_t vn, uint8x8_t vm, __builtin_constant_p(index)) vn -> Vn.8H;vm -> Vm;0 <= index <= 3 LUTI2 Vd.8H, {Vn.8H}, Vm[index] Vd.8H -> result A64
@@ -3960,6 +3967,9 @@ int8x16_t vluti4q_laneq_s8(int8x16_t vn, uint8x16_t vm, __builtin_constant_p(ind
poly8x16_t vluti4q_lane_p8(poly8x16_t vn, uint8x8_t vm, __builtin_constant_p(index)) vn -> Vn.16B;vm -> Vm;0 <= index <= 0 LUTI4 Vd.16B, {Vn.16B}, Vm[index] Vd.16B -> result A64
poly8x16_t vluti4q_laneq_p8(poly8x16_t vn, uint8x16_t vm, __builtin_constant_p(index)) vn -> Vn.16B;vm -> Vm;0 <= index <= 1 LUTI4 Vd.16B, {Vn.16B}, Vm[index] Vd.16B -> result A64
+mfloat8x16_t vluti4q_lane_mf8(mfloat8x16_t vn, uint8x8_t vm, __builtin_constant_p(index)) vn -> Vn.16B;vm -> Vm;0 <= index <= 0 LUTI4 Vd.16B, {Vn.16B}, Vm[index] Vd.16B -> result A64
+mfloat8x16_t vluti4q_laneq_mf8(mfloat8x16_t vn, uint8x16_t vm, __builtin_constant_p(index)) vn -> Vn.16B;vm -> Vm;0 <= index <= 1 LUTI4 Vd.16B, {Vn.16B}, Vm[index] Vd.16B -> result A64
+
uint16x8_t vluti4q_lane_u16_x2(uint16x8x2_t vn, uint8x8_t vm, __builtin_constant_p(index)) vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 1 LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index] Vd.8H -> result A64
uint16x8_t vluti4q_laneq_u16_x2(uint16x8x2_t vn, uint8x16_t vm, __builtin_constant_p(index)) vn.val[0] -> Vn1.8H;vn.val[1] -> Vn2.8H;vm -> Vm;0 <= index <= 3 LUTI4 Vd.8H, {Vn1.8H, Vn2.8H}, Vm[index] Vd.8H -> result A64

diff --git a/tools/intrinsic_db/advsimd_classification.csv b/tools/intrinsic_db/advsimd_classification.csv
index ddfe70ed..e0e79f25 100644
--- a/tools/intrinsic_db/advsimd_classification.csv
+++ b/tools/intrinsic_db/advsimd_classification.csv
@@ -1817,6 +1817,8 @@ vbsl_p16 Bit manipulation|Bitwise select
vbslq_p16 Bit manipulation|Bitwise select
vbsl_f64 Bit manipulation|Bitwise select
vbslq_f64 Bit manipulation|Bitwise select
+vbsl_mf8 Bit manipulation|Bitwise select
+vbslq_mf8 Bit manipulation|Bitwise select
vcopy_lane_s8 Vector manipulation|Copy vector lane
vcopyq_lane_s8 Vector manipulation|Copy vector lane
vcopy_lane_s16 Vector manipulation|Copy vector lane
@@ -4600,6 +4602,10 @@ vluti2_lane_p8 Table lookup|Lookup table read with 2-bit indices
vluti2_laneq_p8 Table lookup|Lookup table read with 2-bit indices
vluti2q_lane_p8 Table lookup|Lookup table read with 2-bit indices
vluti2q_laneq_p8 Table lookup|Lookup table read with 2-bit indices
+vluti2_lane_mf8 Table lookup|Lookup table read with 2-bit indices
+vluti2_laneq_mf8 Table lookup|Lookup table read with 2-bit indices
+vluti2q_lane_mf8 Table lookup|Lookup table read with 2-bit indices
+vluti2q_laneq_mf8 Table lookup|Lookup table read with 2-bit indices
vluti2_lane_u16 Table lookup|Lookup table read with 2-bit indices
vluti2_laneq_u16 Table lookup|Lookup table read with 2-bit indices
vluti2q_lane_u16 Table lookup|Lookup table read with 2-bit indices
@@ -4623,6 +4629,7 @@ vluti2q_laneq_p16 Table lookup|Lookup table read with 2-bit indices
vluti4q_laneq_u8 Table lookup|Lookup table read with 4-bit indices
vluti4q_laneq_s8 Table lookup|Lookup table read with 4-bit indices
vluti4q_laneq_p8 Table lookup|Lookup table read with 4-bit indices
+vluti4q_laneq_mf8 Table lookup|Lookup table read with 4-bit indices
vluti4q_laneq_u16_x2 Table lookup|Lookup table read with 4-bit indices
vluti4q_laneq_s16_x2 Table lookup|Lookup table read with 4-bit indices
vluti4q_laneq_f16_x2 Table lookup|Lookup table read with 4-bit indices
@@ -4636,6 +4643,7 @@ vluti4q_lane_p16_x2 Table lookup|Lookup table read with 4-bit indices
vluti4q_lane_u8 Table lookup|Lookup table read with 4-bit indices
vluti4q_lane_s8 Table lookup|Lookup table read with 4-bit indices
vluti4q_lane_p8 Table lookup|Lookup table read with 4-bit indices
+vluti4q_lane_mf8 Table lookup|Lookup table read with 4-bit indices
vcvt1_bf16_mf8_fpm Data type conversion|Conversions
vcvt1_low_bf16_mf8_fpm Data type conversion|Conversions
vcvt2_bf16_mf8_fpm Data type conversion|Conversions
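
As a usage sketch for the new bitwise-select forms: BSL picks each result *bit* from `b` where the corresponding bit of `a` is set and from `c` where it is clear, so an all-ones/all-zeros byte mask selects whole mf8 lanes. The sketch below assumes an AArch64 toolchain that already exposes the `mfloat8x16_t` type and the `vbslq_mf8` intrinsic added above; helper names are illustrative, and any required feature guards or compiler flags vary by toolchain.

```c
#include <arm_neon.h>

/* Blend two mfloat8 vectors lane by lane: take each byte from b where
   the corresponding mask byte is all ones, and from c where it is all
   zeros.  BSL operates on bits, so a whole-byte mask gives whole-lane
   selection. */
static inline mfloat8x16_t blend_mf8(uint8x16_t mask, mfloat8x16_t b,
                                     mfloat8x16_t c)
{
    return vbslq_mf8(mask, b, c);
}

/* Illustrative caller: alternate lanes of b and c using a repeating
   0xFF,0x00 byte pattern (byte order shown for little-endian). */
static inline mfloat8x16_t blend_alternate_mf8(mfloat8x16_t b,
                                               mfloat8x16_t c)
{
    uint8x16_t mask = vreinterpretq_u8_u16(vdupq_n_u16(0x00ffu));
    return blend_mf8(mask, b, c);
}
```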
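
Similarly, a sketch of the new table-lookup forms, under the same toolchain assumptions: per the tables above, `vluti4q_lane_mf8` selects each of the 16 result bytes from the 16-entry mf8 table in `vn` using packed 4-bit indices from `vm` (the only valid segment index for that variant is 0, per the `0 <= index <= 0` bound), while the `vluti2*` forms use packed 2-bit indices with the segment chosen by the constant argument.

```c
#include <arm_neon.h>

/* 16-entry mfloat8 table lookup: the 64-bit idx vector packs sixteen
   4-bit indices, one per result byte.  The constant segment argument
   must be 0 for this variant. */
static inline mfloat8x16_t lut16_mf8(mfloat8x16_t table, uint8x8_t idx)
{
    return vluti4q_lane_mf8(table, idx, 0);
}

/* 2-bit-index form: the 128-bit idx vector holds four 32-bit segments
   of sixteen 2-bit indices each; this helper uses segment 0 (segments
   0..3 are valid for the laneq variant, per the table above). */
static inline mfloat8x16_t lut2_mf8(mfloat8x16_t table, uint8x16_t idx)
{
    return vluti2q_laneq_mf8(table, idx, 0);
}
```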