Skip to content

Commit

Permalink
Tweaks to the SME2 intrinsics
Browse files Browse the repository at this point in the history
This patch makes a few tweaks to the SME2 intrinsics:

*   The type suffix of the SME2 svread* intrinsics needs to be
    explicit (non-optional), since there is no merge input that
    can be used to infer it.

*   The zn argument to the svluti* intrinsics is a collection of
    2-bit and 4-bit quantities, so it doesn't have a natural element
    size or signedness.  It seems better to keep it as svuint8_t for
    all variants.

*   Because of that, there is no argument that implies the return
    type of the svluti* intrinsics, so the type suffix needs to be
    explicit.  Also, since the instruction performs a bag-of-bits
    lookup, it makes sense to have floating-point variants too.

*   The ZA slice forms of svread* and svwrite* are likewise
    bag-of-bits moves, so we can provide alternatives for all
    element types.

*   arm_neon.h shift-by-immediate instructions use an _n suffix
    to indicate that the shift amount is scalar.  arm_sve.h
    carried this across to the full/non-overloaded forms of SVE
    immediate shifts.  It seems worth doing the same here for
    consistency, and to protect against vector-vector forms
    being added in future.
  • Loading branch information
rsandifo-arm committed Nov 21, 2023
1 parent c65148b commit 386b81f
Showing 1 changed file with 81 additions and 66 deletions.
147 changes: 81 additions & 66 deletions main/acle.md
Original file line number Diff line number Diff line change
Expand Up @@ -9279,7 +9279,7 @@ ZA array vectors. The intrinsics model this in the following way:

``` c
// Reads 2 consecutive horizontal tile slices from ZA into multi-vector.
svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice)
svint8x2_t svread_hor_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_shared_za __arm_preserves_za;
```

Expand Down Expand Up @@ -11225,42 +11225,43 @@ Zero ZT0
Lookup table read with 2-bit and 4-bit indexes

``` c
// Variants are also available for _zt[_s8], _zt[_u16], _zt[_s16], _zt[_u32]
// and _zt[_s32]
svuint8_t svluti2_lane_zt[_u8](uint64_t zt, svuint8_t zn, uint64_t imm_idx)
__arm_streaming __arm_shared_za __arm_preserves_za;
// Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16,
// _zt_bf16, _zt_s32, _zt_u32 and _zt_f32
svint8_t svluti2_lane_zt_s8(uint64_t zt, svuint8_t zn, uint64_t imm_idx)
__arm_streaming __arm_shared_zt __arm_preserves_zt;


// Variants are also available for _zt[_s8], _zt[_u16], _zt[_s16], _zt[_u32]
// and _zt[_s32]
svuint8x2_t svluti2_lane_zt[_u8]_x2(uint64_t zt, svuint8_t zn,
uint64_t imm_idx)
__arm_streaming __arm_shared_za __arm_preserves_za;
// Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16,
// _zt_bf16, _zt_s32, _zt_u32 and _zt_f32
svint8x2_t svluti2_lane_zt_s8_x2(uint64_t zt, svuint8_t zn,
uint64_t imm_idx)
__arm_streaming __arm_shared_zt __arm_preserves_zt;


// Variants are also available for _zt[_s8], _zt[_u16], _zt[_s16], _zt[_u32]
// and _zt[_s32]
svuint8x4_t svluti2_lane_zt[_u8]_x4(uint64_t zt, svuint8_t zn,
uint64_t imm_idx)
__arm_streaming __arm_shared_za __arm_preserves_za;
// Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16,
// _zt_bf16, _zt_s32, _zt_u32 and _zt_f32
svint8x4_t svluti2_lane_zt_s8_x4(uint64_t zt, svuint8_t zn,
uint64_t imm_idx)
__arm_streaming __arm_shared_zt __arm_preserves_zt;


// Variants are also available for _zt[_s8], _zt[_u16], _zt[_s16], _zt[_u32]
// and _zt[_s32]
svuint8_t svluti4_lane_zt[_u8](uint64_t zt, svuint8_t zn, uint64_t imm_idx)
__arm_streaming __arm_shared_za __arm_preserves_za;
// Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16,
// _zt_bf16, _zt_s32, _zt_u32 and _zt_f32
svint8_t svluti4_lane_zt_s8(uint64_t zt, svuint8_t zn, uint64_t imm_idx)
__arm_streaming __arm_shared_zt __arm_preserves_zt;


// Variants are also available for _zt[_s8], _zt[_u16], _zt[_s16], _zt[_u32]
// and _zt[_s32]
svuint8x2_t svluti4_lane_zt[_u8]_x2(uint64_t zt, svuint8_t zn,
uint64_t imm_idx)
__arm_streaming __arm_shared_za __arm_preserves_za;
// Variants are also available for _zt_u8, _zt_s16, _zt_u16, _zt_f16,
// _zt_bf16, _zt_s32, _zt_u32 and _zt_f32
svint8x2_t svluti4_lane_zt_s8_x2(uint64_t zt, svuint8_t zn,
uint64_t imm_idx)
__arm_streaming __arm_shared_zt __arm_preserves_zt;


// Variants are also available for _zt[_s16], _zt[_u32] and _zt[_s32]
svuint16x4_t svluti4_lane_zt[_u16]_x4(uint64_t zt, svuint16_t zn,
uint64_t imm_idx)
// Variants are also available for _zt_u16, _zt_f16, _zt_bf16, _zt_s32,
// _zt_u32 and _zt_f32
svint16x4_t svluti4_lane_zt_s16_x4(uint64_t zt, svuint16_t zn,
uint64_t imm_idx)
__arm_streaming __arm_shared_za __arm_preserves_za;
```

Expand All @@ -11269,79 +11270,87 @@ Lookup table read with 2-bit and 4-bit indexes
Move multi-vectors to/from ZA

``` c
// Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64_[f64]
svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice)
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svread_hor_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_shared_za __arm_preserves_za;


// Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64_[f64]
svint8x4_t svread_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice)
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svread_hor_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_shared_za __arm_preserves_za;


// Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64_[f64]
svint8x2_t svread_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice)
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svread_ver_za8_s8_vg2(uint64_t tile, uint32_t slice)
__arm_streaming __arm_shared_za __arm_preserves_za;


// Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64_[f64]
svint8x4_t svread_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice)
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svread_ver_za8_s8_vg4(uint64_t tile, uint32_t slice)
__arm_streaming __arm_shared_za __arm_preserves_za;


// Variants are also available for _za64_u64 and _za64_f64
svint64x2_t svread_za64_s64_vg1x2(uint32_t slice)
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x2_t svread_za8_s8_vg1x2(uint32_t slice)
__arm_streaming __arm_shared_za __arm_preserves_za;


// Variants are also available for _za64_u64 and _za64_f64
svint64x4_t svread_za64_s64_vg1x4(uint32_t slice)
// Variants are also available for _za8_u8, _za16_s16, _za16_u16,
// _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32,
// _za64_s64, _za64_u64 and _za64_f64
svint8x4_t svread_za8_s8_vg1x4(uint32_t slice)
__arm_streaming __arm_shared_za __arm_preserves_za;


// Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64_[f64]
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn)
__arm_streaming __arm_shared_za;


// Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64_[f64]
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn)
__arm_streaming __arm_shared_za;


// Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64_[f64]
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn)
__arm_streaming __arm_shared_za;


// Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64_[f64]
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn)
__arm_streaming __arm_shared_za;


// Variants are also available for _za64[_u64] and _za64[_f64]
void svwrite_za64[_s64]_vg1x2(uint32_t slice, svint64x2_t zn)
// Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_za8[_s8]_vg1x2(uint32_t slice, svint8x2_t zn)
__arm_streaming __arm_shared_za;


// Variants are also available for _za64[_u64] and _za64[_f64]
void svwrite_za64[_s64]_vg1x4(uint32_t slice, svint64x4_t zn)
// Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
// _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
// _za64[_s64], _za64[_u64] and _za64[_f64]
void svwrite_za8[_s8]_vg1x4(uint32_t slice, svint8x4_t zn)
__arm_streaming __arm_shared_za;
```

Expand Down Expand Up @@ -11473,15 +11482,18 @@ Multi-vector saturating rounding shift right narrow

``` c
// Variants are also available for _u8[_u32_x4]
svint8_t svqrshr_s8[_s32_x4](svint32x4_t zn, uint64_t imm) __arm_streaming;
svint8_t svqrshr[_n]_s8[_s32_x4](svint32x4_t zn, uint64_t imm)
__arm_streaming;


// Variants are also available for _u16[_u32_x2]
svint16_t svqrshr_s16[_s32_x2](svint32x2_t zn, uint64_t imm) __arm_streaming;
svint16_t svqrshr[_n]_s16[_s32_x2](svint32x2_t zn, uint64_t imm)
__arm_streaming;


// Variants are also available for _u16[_u64_x4]
svint16_t svqrshr_s16[_s64_x4](svint64x4_t zn, uint64_t imm) __arm_streaming;
svint16_t svqrshr[_n]_s16[_s64_x4](svint64x4_t zn, uint64_t imm)
__arm_streaming;
```

#### SQRSHRN, UQRSHRN
Expand All @@ -11490,17 +11502,17 @@ Multi-vector saturating rounding shift right narrow and interleave

``` c
// Variants are also available for _u8[_u32_x4]
svint8_t svqrshrn_s8[_s32_x4](svint32x4_t zn, uint64_t imm)
svint8_t svqrshrn[_n]_s8[_s32_x4](svint32x4_t zn, uint64_t imm)
__arm_streaming;


// Variants are also available for _u16[_u32_x2]
svint16_t svqrshrn_s16[_s32_x2](svint32x2_t zn, uint64_t imm)
svint16_t svqrshrn[_n]_s16[_s32_x2](svint32x2_t zn, uint64_t imm)
__arm_streaming_compatible;


// Variants are also available for _u16[_u64_x4]
svint16_t svqrshrn_s16[_s64_x4](svint64x4_t zn, uint64_t imm)
svint16_t svqrshrn[_n]_s16[_s64_x4](svint64x4_t zn, uint64_t imm)
__arm_streaming;
```

Expand All @@ -11509,26 +11521,29 @@ Multi-vector saturating rounding shift right narrow and interleave
Multi-vector saturating rounding shift right unsigned narrow

``` c
svuint8_t svqrshru_u8[_s32_x4](svint32x4_t zn, uint64_t imm) __arm_streaming;
svuint8_t svqrshru[_n]_u8[_s32_x4](svint32x4_t zn, uint64_t imm)
__arm_streaming;


svuint16_t svqrshru_u16[_s32_x2](svint32x2_t zn, uint64_t imm) __arm_streaming;
svuint16_t svqrshru[_n]_u16[_s32_x2](svint32x2_t zn, uint64_t imm)
__arm_streaming;


svuint16_t svqrshru_u16[_s64_x4](svint64x4_t zn, uint64_t imm) __arm_streaming;
svuint16_t svqrshru[_n]_u16[_s64_x4](svint64x4_t zn, uint64_t imm)
__arm_streaming;
```

#### SQRSHRUN

Multi-vector saturating rounding shift right unsigned narrow and interleave

``` c
svuint16_t svqrshrun_u16[_s32_x2](svint32x2_t zn, uint64_t imm)
svuint16_t svqrshrun[_n]_u16[_s32_x2](svint32x2_t zn, uint64_t imm)
__arm_streaming_compatible;


// Variants are also available for _u16[_s64_x4]
svuint8_t svqrshrun_u8[_s32_x4](svint32x4_t zn, uint64_t imm)
svuint8_t svqrshrun[_n]_u8[_s32_x4](svint32x4_t zn, uint64_t imm)
__arm_streaming;
```

Expand Down

0 comments on commit 386b81f

Please sign in to comment.