From 27c61991aa8fac9b16b1b50c3e49acd3547da866 Mon Sep 17 00:00:00 2001 From: Jamie Cunliffe Date: Mon, 25 Oct 2021 12:52:52 +0100 Subject: [PATCH] Correct the vqrdmlah intrinsics. These intrinsics were added in 8.1 and should emit a sqrdmlah, however they currently emit sqrdmulh followed by a sqadd. LLVM doesn't convert this into sqrdmlah without enabling the rdm feature. --- .../core_arch/src/aarch64/neon/generated.rs | 274 ++++++++++++++++-- .../src/arm_shared/neon/generated.rs | 256 ---------------- crates/stdarch-gen/neon.spec | 21 +- crates/stdarch-gen/src/main.rs | 5 + 4 files changed, 274 insertions(+), 282 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 77deca552a..194695c11c 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -9639,58 +9639,176 @@ pub unsafe fn vqrdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32 /// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(sqrdmulh))] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] +pub unsafe fn vqrdmlah_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + vqadd_s16(a, vqrdmulh_s16(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] +pub unsafe fn vqrdmlahq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + vqaddq_s16(a, vqrdmulhq_s16(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] +pub unsafe fn vqrdmlah_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + vqadd_s32(a, vqrdmulh_s32(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] +pub unsafe fn vqrdmlahq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + vqaddq_s32(a, vqrdmulhq_s32(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] pub unsafe fn vqrdmlahh_s16(a: i16, b: i16, c: i16) -> i16 { - vqaddh_s16(a, vqrdmulhh_s16(b, c)) + let a: int16x4_t = vdup_n_s16(a); + let b: int16x4_t = vdup_n_s16(b); + let c: int16x4_t = vdup_n_s16(c); + simd_extract(vqrdmlah_s16(a, b, c), 0) } /// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(sqrdmulh))] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah))] pub unsafe fn vqrdmlahs_s32(a: i32, b: i32, c: i32) -> i32 { - vqadds_s32(a, vqrdmulhs_s32(b, c)) + let a: int32x2_t = vdup_n_s32(a); + let b: int32x2_t = vdup_n_s32(b); + let c: int32x2_t = vdup_n_s32(c); + simd_extract(vqrdmlah_s32(a, b, c), 0) } /// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlah_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + static_assert_imm2!(LANE); + vqadd_s16(a, vqrdmulh_lane_s16::(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlah_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { + static_assert_imm3!(LANE); + vqadd_s16(a, vqrdmulh_laneq_s16::(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlahq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { + static_assert_imm2!(LANE); + vqaddq_s16(a, vqrdmulhq_lane_s16::(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlahq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + static_assert_imm3!(LANE); + vqaddq_s16(a, vqrdmulhq_laneq_s16::(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlah_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + static_assert_imm1!(LANE); + vqadd_s32(a, vqrdmulh_lane_s32::(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlah_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { + static_assert_imm2!(LANE); + vqadd_s32(a, vqrdmulh_laneq_s32::(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlahq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { + static_assert_imm1!(LANE); + vqaddq_s32(a, vqrdmulhq_lane_s32::(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqrdmlahq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + static_assert_imm2!(LANE); + vqaddq_s32(a, vqrdmulhq_laneq_s32::(b, c)) +} + +/// Signed saturating rounding doubling multiply accumulate returning high half +#[inline] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] #[rustc_legacy_const_generics(3)] pub unsafe fn vqrdmlahh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16 { static_assert_imm2!(LANE); - vqaddh_s16(a, vqrdmulhh_lane_s16::(b, c)) + vqrdmlahh_s16(a, b, simd_extract(c, LANE as u32)) } /// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] #[rustc_legacy_const_generics(3)] pub unsafe fn vqrdmlahh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16 { static_assert_imm3!(LANE); - vqaddh_s16(a, vqrdmulhh_laneq_s16::(b, c)) + vqrdmlahh_s16(a, b, simd_extract(c, LANE as u32)) } /// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] #[rustc_legacy_const_generics(3)] pub unsafe fn vqrdmlahs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32 { static_assert_imm1!(LANE); - vqadds_s32(a, vqrdmulhs_lane_s32::(b, c)) + vqrdmlahs_s32(a, b, simd_extract(c, LANE as u32)) } /// Signed saturating rounding doubling multiply accumulate returning high half #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(sqrdmulh, LANE = 1))] +#[target_feature(enable = "rdm")] +#[cfg_attr(test, assert_instr(sqrdmlah, LANE = 1))] #[rustc_legacy_const_generics(3)] pub unsafe fn vqrdmlahs_laneq_s32(a: i32, b: i32, c: int32x4_t) -> i32 { static_assert_imm2!(LANE); - vqadds_s32(a, vqrdmulhs_laneq_s32::(b, c)) + vqrdmlahs_s32(a, b, simd_extract(c, LANE as u32)) } /// Signed saturating rounding doubling multiply subtract returning high half @@ -20709,6 +20827,46 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlah_s16() { + let a: i16x4 = i16x4::new(1, 1, 1, 1); + let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); + let c: i16x4 = i16x4::new(2, 2, 2, 2); + let e: i16x4 = i16x4::new(3, 3, 3, 3); + let r: i16x4 = transmute(vqrdmlah_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlahq_s16() { + let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); + let c: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let e: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let r: i16x8 = transmute(vqrdmlahq_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlah_s32() { + let a: i32x2 = i32x2::new(1, 1); + let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF); + let c: i32x2 = i32x2::new(2, 2); + let e: i32x2 = i32x2::new(3, 3); + let r: i32x2 = transmute(vqrdmlah_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlahq_s32() { + let a: i32x4 = i32x4::new(1, 1, 1, 1); + let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF); + let c: i32x4 = i32x4::new(2, 2, 2, 2); + let e: i32x4 = i32x4::new(3, 3, 3, 3); + let r: i32x4 = transmute(vqrdmlahq_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vqrdmlahh_s16() { let a: i16 = 1; @@ -20729,6 +20887,86 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlah_lane_s16() { + let a: i16x4 = i16x4::new(1, 1, 1, 1); + let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); + let c: i16x4 = i16x4::new(0, 2, 0, 0); + let e: i16x4 = i16x4::new(3, 3, 3, 3); + let r: i16x4 = transmute(vqrdmlah_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlah_laneq_s16() { + let a: i16x4 = i16x4::new(1, 1, 1, 1); + let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); + let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: i16x4 = i16x4::new(3, 3, 3, 3); + let r: i16x4 = transmute(vqrdmlah_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlahq_lane_s16() { + let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); + let c: i16x4 = i16x4::new(0, 2, 0, 0); + let e: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let r: i16x8 = transmute(vqrdmlahq_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlahq_laneq_s16() { + let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); + let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); + let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0); + let e: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let r: i16x8 = transmute(vqrdmlahq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlah_lane_s32() { + let a: i32x2 = i32x2::new(1, 1); + let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF); + let c: i32x2 = i32x2::new(0, 2); + let e: i32x2 = i32x2::new(3, 3); + let r: i32x2 = transmute(vqrdmlah_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlah_laneq_s32() { + let a: i32x2 = i32x2::new(1, 1); + let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF); + let c: i32x4 = i32x4::new(0, 2, 0, 0); + let e: i32x2 = i32x2::new(3, 3); + let r: i32x2 = transmute(vqrdmlah_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlahq_lane_s32() { + let a: i32x4 = i32x4::new(1, 1, 1, 1); + let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF); + let c: i32x2 = i32x2::new(0, 2); + let e: i32x4 = i32x4::new(3, 3, 3, 3); + let r: i32x4 = transmute(vqrdmlahq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vqrdmlahq_laneq_s32() { + let a: i32x4 = i32x4::new(1, 1, 1, 1); + let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF); + let c: i32x4 = i32x4::new(0, 2, 0, 0); + let e: i32x4 = i32x4::new(3, 3, 3, 3); + let r: i32x4 = transmute(vqrdmlahq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vqrdmlahh_lane_s16() { let a: i16 = 1; diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 2e98d473bf..bd78a973e8 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -16376,142 +16376,6 @@ pub unsafe fn vqrdmulhq_laneq_s32(a: int32x4_t, b: int32x4_t) - vqrdmulhq_s32(a, b) } -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlah_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { - vqadd_s16(a, vqrdmulh_s16(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlahq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { - vqaddq_s16(a, vqrdmulhq_s16(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlah_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { - vqadd_s32(a, vqrdmulh_s32(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh))] -pub unsafe fn vqrdmlahq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { - vqaddq_s32(a, vqrdmulhq_s32(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlah_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { - static_assert_imm2!(LANE); - vqadd_s16(a, vqrdmulh_lane_s16::(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlah_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t { - static_assert_imm3!(LANE); - vqadd_s16(a, vqrdmulh_laneq_s16::(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlahq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t { - static_assert_imm2!(LANE); - vqaddq_s16(a, vqrdmulhq_lane_s16::(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlahq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { - static_assert_imm3!(LANE); - vqaddq_s16(a, vqrdmulhq_laneq_s16::(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlah_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { - static_assert_imm1!(LANE); - vqadd_s32(a, vqrdmulh_lane_s32::(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlah_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t { - static_assert_imm2!(LANE); - vqadd_s32(a, vqrdmulh_laneq_s32::(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlahq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t { - static_assert_imm1!(LANE); - vqaddq_s32(a, vqrdmulhq_lane_s32::(b, c)) -} - -/// Signed saturating rounding doubling multiply accumulate returning high half -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrdmulh, LANE = 1))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrdmulh, LANE = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn vqrdmlahq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { - static_assert_imm2!(LANE); - vqaddq_s32(a, vqrdmulhq_laneq_s32::(b, c)) -} - /// Signed saturating rounding doubling multiply subtract returning high half #[inline] #[target_feature(enable = "neon")] @@ -33521,126 +33385,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlah_s16() { - let a: i16x4 = i16x4::new(1, 1, 1, 1); - let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); - let c: i16x4 = i16x4::new(2, 2, 2, 2); - let e: i16x4 = i16x4::new(3, 3, 3, 3); - let r: i16x4 = transmute(vqrdmlah_s16(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlahq_s16() { - let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); - let c: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let e: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let r: i16x8 = transmute(vqrdmlahq_s16(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlah_s32() { - let a: i32x2 = i32x2::new(1, 1); - let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF); - let c: i32x2 = i32x2::new(2, 2); - let e: i32x2 = i32x2::new(3, 3); - let r: i32x2 = transmute(vqrdmlah_s32(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlahq_s32() { - let a: i32x4 = i32x4::new(1, 1, 1, 1); - let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF); - let c: i32x4 = i32x4::new(2, 2, 2, 2); - let e: i32x4 = i32x4::new(3, 3, 3, 3); - let r: i32x4 = transmute(vqrdmlahq_s32(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlah_lane_s16() { - let a: i16x4 = i16x4::new(1, 1, 1, 1); - let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); - let c: i16x4 = i16x4::new(0, 2, 0, 0); - let e: i16x4 = i16x4::new(3, 3, 3, 3); - let r: i16x4 = transmute(vqrdmlah_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlah_laneq_s16() { - let a: i16x4 = i16x4::new(1, 1, 1, 1); - let b: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); - let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0); - let e: i16x4 = i16x4::new(3, 3, 3, 3); - let r: i16x4 = transmute(vqrdmlah_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlahq_lane_s16() { - let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); - let c: i16x4 = i16x4::new(0, 2, 0, 0); - let e: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let r: i16x8 = transmute(vqrdmlahq_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlahq_laneq_s16() { - let a: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1); - let b: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF, 0x7F_FF); - let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0); - let e: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let r: i16x8 = transmute(vqrdmlahq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlah_lane_s32() { - let a: i32x2 = i32x2::new(1, 1); - let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF); - let c: i32x2 = i32x2::new(0, 2); - let e: i32x2 = i32x2::new(3, 3); - let r: i32x2 = transmute(vqrdmlah_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlah_laneq_s32() { - let a: i32x2 = i32x2::new(1, 1); - let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF); - let c: i32x4 = i32x4::new(0, 2, 0, 0); - let e: i32x2 = i32x2::new(3, 3); - let r: i32x2 = transmute(vqrdmlah_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlahq_lane_s32() { - let a: i32x4 = i32x4::new(1, 1, 1, 1); - let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF); - let c: i32x2 = i32x2::new(0, 2); - let e: i32x4 = i32x4::new(3, 3, 3, 3); - let r: i32x4 = transmute(vqrdmlahq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vqrdmlahq_laneq_s32() { - let a: i32x4 = i32x4::new(1, 1, 1, 1); - let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x7F_FF_FF_FF); - let c: i32x4 = i32x4::new(0, 2, 0, 0); - let e: i32x4 = i32x4::new(3, 3, 3, 3); - let r: i32x4 = transmute(vqrdmlahq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vqrdmlsh_s16() { let a: i16x4 = i16x4::new(1, 1, 1, 1); diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 2e1e1b36ec..0dca59839b 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -5539,19 +5539,23 @@ b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX c = 2, 2, 2, 2, 2, 2, 2, 2 validate 3, 3, 3, 3, 3, 3, 3, 3 -aarch64 = sqrdmulh -arm = vqrdmulh +aarch64 = sqrdmlah +target = rdm generate int16x4_t, int16x8_t, int32x2_t, int32x4_t /// Signed saturating rounding doubling multiply accumulate returning high half name = vqrdmlah -multi_fn = vqadd-self-noext, a, {vqrdmulh-self-noext, b, c} +multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a +multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b +multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c +multi_fn = simd_extract, {vqrdmlah-in_ntt-noext, a, b, c}, 0 a = 1 b = 1 c = 2 validate 1 -aarch64 = sqrdmulh +aarch64 = sqrdmlah +target = rdm generate i16, i32 /// Signed saturating rounding doubling multiply accumulate returning high half @@ -5566,8 +5570,8 @@ c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate 3, 3, 3, 3, 3, 3, 3, 3 -aarch64 = sqrdmulh -arm = vqrdmulh +aarch64 = sqrdmlah +target = rdm generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t @@ -5576,14 +5580,15 @@ name = vqrdmlah in2-lane-suffixes constn = LANE multi_fn = static_assert_imm-in2_exp_len-LANE -multi_fn = vqadd-self-noext, a, {vqrdmulh-in2lane-::, b, c} +multi_fn = vqrdmlah-self-noext, a, b, {simd_extract, c, LANE as u32} a = 1 b = 1 c = 0, 2, 0, 0, 0, 0, 0, 0 n = 1 validate 1 -aarch64 = sqrdmulh +aarch64 = sqrdmlah +target = rdm generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 /// Signed saturating rounding doubling multiply subtract returning high half diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs index 961ab9d27b..a33933ad97 100644 --- a/crates/stdarch-gen/src/main.rs +++ b/crates/stdarch-gen/src/main.rs @@ -464,6 +464,7 @@ enum TargetFeature { FCMA, Dotprod, I8MM, + RDM, } #[derive(Clone, Copy)] @@ -1067,6 +1068,7 @@ fn gen_aarch64( FCMA => "neon,fcma", Dotprod => "neon,dotprod", I8MM => "neon,i8mm", + RDM => "rdm", }; let current_fn = if let Some(current_fn) = current_fn.clone() { if link_aarch64.is_some() { @@ -1775,6 +1777,7 @@ fn gen_arm( FCMA => "neon,fcma", Dotprod => "neon,dotprod", I8MM => "neon,i8mm", + RDM => "rdm", }; let current_target_arm = match target { Default => "v7", @@ -1785,6 +1788,7 @@ fn gen_arm( FCMA => "v8", // v8.3a Dotprod => "v8", // v8.2a I8MM => "v8", // v8.6a + RDM => unreachable!(), }; let current_fn = if let Some(current_fn) = current_fn.clone() { if link_aarch64.is_some() || link_arm.is_some() { @@ -3169,6 +3173,7 @@ mod test { "fcma" => FCMA, "dotprod" => Dotprod, "i8mm" => I8MM, + "rdm" => RDM, _ => Default, }, _ => Default,