From a1898f5ff0b4f68763f7c9b4b1d49062108eede9 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Tue, 9 Nov 2021 18:46:38 +0800 Subject: [PATCH 1/5] add vmmla vusmmla vsm4e vsm3 vrax1 vxar vsha512 vbcax veor3 neon instructions --- .../core_arch/src/aarch64/neon/generated.rs | 434 +++++++++++++++--- crates/core_arch/src/aarch64/neon/mod.rs | 140 ++++++ crates/core_arch/src/arm_shared/neon/mod.rs | 86 ++++ crates/core_arch/src/lib.rs | 3 +- crates/stdarch-gen/neon.spec | 189 +++++++- crates/stdarch-gen/src/main.rs | 56 ++- crates/stdarch-verify/tests/arm.rs | 26 ++ 7 files changed, 834 insertions(+), 100 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 194695c11c..efa637d1c3 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -11,66 +11,106 @@ use stdarch_test::assert_instr; /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v16i8")] + fn veor3q_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t; + } + veor3q_s8_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v8i16")] + fn veor3q_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t; + } + veor3q_s16_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v4i32")] + fn veor3q_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; + } + veor3q_s32_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v2i64")] + fn veor3q_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t; + } + veor3q_s64_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v16i8")] + fn 
veor3q_u8_(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t; + } + veor3q_u8_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v8i16")] + fn veor3q_u16_(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t; + } + veor3q_u16_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v4i32")] + fn veor3q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + veor3q_u32_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v2i64")] + fn veor3q_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + veor3q_u64_(a, b, c) } /// Absolute difference between the arguments of Floating @@ -7770,66 +7810,106 @@ pub unsafe fn vsubl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { - simd_xor(a, vbicq_s8(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v16i8")] + fn vbcaxq_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t; + } + vbcaxq_s8_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { - simd_xor(a, vbicq_s16(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v8i16")] + fn vbcaxq_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t; + } + vbcaxq_s16_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { - simd_xor(a, vbicq_s32(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v4i32")] + fn vbcaxq_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; + } + vbcaxq_s32_(a, b, c) } /// 
Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { - simd_xor(a, vbicq_s64(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v2i64")] + fn vbcaxq_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t; + } + vbcaxq_s64_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { - simd_xor(a, vbicq_u8(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v16i8")] + fn vbcaxq_u8_(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t; + } + vbcaxq_u8_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { - simd_xor(a, vbicq_u16(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v8i16")] + fn vbcaxq_u16_(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t; + } + vbcaxq_u16_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { - simd_xor(a, vbicq_u32(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v4i32")] + fn vbcaxq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + vbcaxq_u32_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { - simd_xor(a, vbicq_u64(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v2i64")] + fn vbcaxq_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + vbcaxq_u64_(a, b, c) } /// Floating-point complex add @@ -11886,6 +11966,136 @@ pub unsafe fn vshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> ui simd_shuffle4!(a, vshrn_n_u64::(b), [0, 1, 2, 3]) } +/// SM3PARTW1 +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3partw1))] +pub unsafe fn vsm3partw1q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3partw1")] + fn vsm3partw1q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + vsm3partw1q_u32_(a, b, c) +} + +/// SM3PARTW2 +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3partw2))] +pub unsafe fn vsm3partw2q_u32(a: uint32x4_t, 
b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3partw2")] + fn vsm3partw2q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + vsm3partw2q_u32_(a, b, c) +} + +/// SM3SS1 +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3ss1))] +pub unsafe fn vsm3ss1q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3ss1")] + fn vsm3ss1q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + vsm3ss1q_u32_(a, b, c) +} + +/// SM4 key +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm4ekey))] +pub unsafe fn vsm4ekeyq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm4ekey")] + fn vsm4ekeyq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + vsm4ekeyq_u32_(a, b) +} + +/// SM4 encode +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm4e))] +pub unsafe fn vsm4eq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm4e")] + fn vsm4eq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + vsm4eq_u32_(a, b) +} + +/// Rotate and exclusive OR +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(rax1))] +pub unsafe fn vrax1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.rax1")] + fn vrax1q_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; + } + vrax1q_u64_(a, b) +} + +/// SHA512 hash update part 1 +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512h))] +pub unsafe fn vsha512hq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512h")] + fn vsha512hq_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + vsha512hq_u64_(a, b, c) +} + +/// SHA512 hash update part 2 +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512h2))] +pub unsafe fn vsha512h2q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512h2")] + fn vsha512h2q_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + vsha512h2q_u64_(a, b, c) +} + +/// SHA512 schedule update 0 +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512su0))] +pub unsafe fn vsha512su0q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512su0")] + fn vsha512su0q_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; + } + vsha512su0q_u64_(a, b) +} + +/// SHA512 schedule update 1 +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512su1))] +pub unsafe fn vsha512su1q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t 
{ + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512su1")] + fn vsha512su1q_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + vsha512su1q_u64_(a, b, c) +} + /// Transpose vectors #[inline] #[target_feature(enable = "neon")] @@ -13086,7 +13296,7 @@ mod test { use std::mem::transmute; use stdarch_test::simd_test; - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_s8() { let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); @@ -13096,7 +13306,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_s16() { let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); @@ -13106,7 +13316,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_s32() { let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00); @@ -13116,7 +13326,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_s64() { let a: i64x2 = i64x2::new(0x00, 0x01); let b: i64x2 = i64x2::new(0x00, 0x00); @@ -13126,7 +13336,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_u8() { let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); @@ -13136,7 +13346,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_u16() { let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); @@ -13146,7 +13356,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_u32() { let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); @@ -13156,7 +13366,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_u64() { let a: u64x2 = u64x2::new(0x00, 0x01); let b: u64x2 = u64x2::new(0x00, 0x00); @@ -19212,7 +19422,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_s8() { let a: i8x16 = i8x16::new(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0); let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -19222,7 +19432,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_s16() { let a: i16x8 = i16x8::new(1, 0, 1, 0, 1, 0, 1, 0); let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); @@ -19232,7 +19442,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_s32() { let a: i32x4 = i32x4::new(1, 0, 1, 0); let b: i32x4 = i32x4::new(0, 1, 2, 3); @@ -19242,7 
+19452,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_s64() { let a: i64x2 = i64x2::new(1, 0); let b: i64x2 = i64x2::new(0, 1); @@ -19252,7 +19462,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_u8() { let a: u8x16 = u8x16::new(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0); let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -19262,7 +19472,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_u16() { let a: u16x8 = u16x8::new(1, 0, 1, 0, 1, 0, 1, 0); let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); @@ -19272,7 +19482,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_u32() { let a: u32x4 = u32x4::new(1, 0, 1, 0); let b: u32x4 = u32x4::new(0, 1, 2, 3); @@ -19282,7 +19492,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_u64() { let a: u64x2 = u64x2::new(1, 0); let b: u64x2 = u64x2::new(0, 1); @@ -22841,6 +23051,102 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3partw1q_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vsm3partw1q_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3partw2q_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vsm3partw2q_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3ss1q_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vsm3ss1q_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm4ekeyq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vsm4ekeyq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm4eq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vsm4eq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vrax1q_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let e: u64x2 = u64x2::new(5, 6); + let r: u64x2 = transmute(vrax1q_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vsha512hq_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let c: u64x2 = u64x2::new(5, 6); + let e: u64x2 = u64x2::new(7, 8); + let r: u64x2 = transmute(vsha512hq_u64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn 
test_vsha512h2q_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let c: u64x2 = u64x2::new(5, 6); + let e: u64x2 = u64x2::new(7, 8); + let r: u64x2 = transmute(vsha512h2q_u64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vsha512su0q_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let e: u64x2 = u64x2::new(7, 8); + let r: u64x2 = transmute(vsha512su0q_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vsha512su1q_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let c: u64x2 = u64x2::new(5, 6); + let e: u64x2 = u64x2::new(7, 8); + let r: u64x2 = transmute(vsha512su1q_u64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vtrn1_s8() { let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14); diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index 1cc10dc15d..06044e8569 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -3208,6 +3208,97 @@ pub unsafe fn vsriq_n_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x transmute(vsriq_n_s64_(transmute(a), transmute(b), N)) } +/// SM3TT1A +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt1a, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsm3tt1aq_u32( + a: uint32x4_t, + b: uint32x4_t, + c: uint32x4_t, +) -> uint32x4_t { + static_assert_imm2!(IMM2); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt1a")] + fn vsm3tt1aq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t; + } + vsm3tt1aq_u32_(a, b, c, IMM2 as i64) +} + +/// SM3TT1B +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt1b, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsm3tt1bq_u32( + a: uint32x4_t, + b: uint32x4_t, + c: uint32x4_t, +) -> uint32x4_t { + static_assert_imm2!(IMM2); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt1b")] + fn vsm3tt1bq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t; + } + vsm3tt1bq_u32_(a, b, c, IMM2 as i64) +} + +/// SM3TT2A +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt2a, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsm3tt2aq_u32( + a: uint32x4_t, + b: uint32x4_t, + c: uint32x4_t, +) -> uint32x4_t { + static_assert_imm2!(IMM2); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt2a")] + fn vsm3tt2aq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t; + } + vsm3tt2aq_u32_(a, b, c, IMM2 as i64) +} + +/// SM3TT2B +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt2b, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsm3tt2bq_u32( + a: uint32x4_t, + b: uint32x4_t, + c: uint32x4_t, +) -> uint32x4_t { + static_assert_imm2!(IMM2); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt2b")] + fn vsm3tt2bq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t; + } + vsm3tt2bq_u32_(a, 
b, c, IMM2 as i64) +} + +/// Exclusive OR and rotate +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(xar, IMM6 = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vxarq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert_imm6!(IMM6); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.xar")] + fn vxarq_u64_(a: uint64x2_t, b: uint64x2_t, n: i64) -> uint64x2_t; + } + vxarq_u64_(a, b, IMM6 as i64) +} + #[cfg(test)] mod tests { use crate::core_arch::aarch64::test_support::*; @@ -4866,6 +4957,55 @@ mod tests { assert_eq!(vals[1], 1.); assert_eq!(vals[2], 2.); } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3tt1aq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vsm3tt1aq_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3tt1bq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vsm3tt1bq_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3tt2aq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vsm3tt2aq_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3tt2bq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vsm3tt2bq_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vxarq_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(1, 2); + let e: u64x2 = u64x2::new(5, 6); + let r: u64x2 = transmute(vxarq_u64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } } #[cfg(test)] diff --git a/crates/core_arch/src/arm_shared/neon/mod.rs b/crates/core_arch/src/arm_shared/neon/mod.rs index 588c86537d..b11df45919 100644 --- a/crates/core_arch/src/arm_shared/neon/mod.rs +++ b/crates/core_arch/src/arm_shared/neon/mod.rs @@ -4806,6 +4806,63 @@ pub unsafe fn vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { } } +/// 8-bit integer matrix multiply-accumulate +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsmmla))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smmla))] +pub unsafe fn vmmlaq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.smmla.v4i32.v16i8")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.smmla.v4i32.v16i8" + )] + fn vmmlaq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t; + } + vmmlaq_s32_(a, b, c) +} + +/// 8-bit integer matrix multiply-accumulate +#[inline] +#[target_feature(enable = "neon,i8mm")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vummla))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ummla))] +pub unsafe fn vmmlaq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.ummla.v4i32.v16i8")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.ummla.v4i32.v16i8" + )] + fn vmmlaq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t; + } + vmmlaq_u32_(a, b, c) +} + +/// Unsigned and signed 8-bit integer matrix multiply-accumulate +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusmmla))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usmmla))] +pub unsafe fn vusmmlaq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.usmmla.v4i32.v16i8")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.usmmla.v4i32.v16i8" + )] + fn vusmmlaq_s32_(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t; + } + vusmmlaq_s32_(a, b, c) +} + #[cfg(test)] mod tests { use super::*; @@ -10368,6 +10425,35 @@ mod tests { let e: u16x8 = transmute(vrev64q_p16(transmute(a))); assert_eq!(r, e); } + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vmmlaq_s32() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + let b: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let c: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let e: i32x4 = i32x4::new(1, 2, 3, 4); + let r: i32x4 = transmute(vmmlaq_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vmmlaq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let c: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vmmlaq_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusmmlaq_s32() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + let b: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let c: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let e: i32x4 = i32x4::new(1, 2, 3, 4); + let r: i32x4 = transmute(vusmmlaq_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } } #[cfg(all(test, target_arch = "arm", target_endian = "little"))] diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index 33b0627d61..d43192b399 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -33,7 +33,8 @@ f16c_target_feature, allow_internal_unstable, decl_macro, - bench_black_box + bench_black_box, + asm_const )] #![cfg_attr(test, feature(test, abi_vectorcall))] #![deny(clippy::missing_inline_in_public_items)] diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 0dca59839b..a034873504 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -104,15 +104,16 @@ generate int*_t, uint*_t, int64x*_t, uint64x*_t /// Three-way exclusive OR name = veor3 -multi_fn = simd_xor, 
{simd_xor, a, b}, c a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 c = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F +target = sha3 -// llvm does not currently support `eor3` instructions -aarch64 = nop +aarch64 = eor3 +link-aarch64 = llvm.aarch64.crypto.eor3s._EXT_ generate int8x16_t, int16x8_t, int32x4_t, int64x2_t +link-aarch64 = llvm.aarch64.crypto.eor3u._EXT_ generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t //////////////////// @@ -4438,15 +4439,16 @@ generate uint32x4_t:uint32x4_t:uint64x2_t /// Bit clear and exclusive OR name = vbcax -multi_fn = simd_xor, a, {vbic-self-noext, b, c} a = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 c = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 validate 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 +target = sha3 -// llvm does not currently support the `bcax` instruction -aarch64 = nop +aarch64 = bcax +link-aarch64 = llvm.aarch64.crypto.bcaxs._EXT_ generate int8x16_t, int16x8_t, int32x4_t, int64x2_t +link-aarch64 = llvm.aarch64.crypto.bcaxu._EXT_ generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t /// Floating-point complex add @@ -4805,24 +4807,6 @@ generate float32x2_t:f32, float64x2_t:f64 aarch64 = fminnmv generate float32x4_t:f32 -/// 8-bit integer matrix multiply-accumulate -name = vmmlaq -a = 1, 2, 3, 4 -b = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -c = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -validate 1, 2, 3, 4 -target = i8mm - -aarch64 = smmla -link-aarch64 = smmla._EXT_._EXT3_ -// the feature `i8mm` is not valid for some target -//generate int32x4_t:int8x16_t:int8x16_t:int32x4_t - -aarch64 = ummla -link-aarch64 = ummla._EXT_._EXT3_ -// the feature `i8mm` is not valid for some target -//generate uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t - /// Vector move name = vmovl_high no-q @@ -6862,6 +6846,163 @@ aarch64 = usra arm = vsra generate uint*_t, uint64x*_t +/// SM3PARTW1 +name = vsm3partw1 +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +c = 1, 2, 3, 4 +validate 1, 2, 3, 4 +target = sm4 + +aarch64 = sm3partw1 +link-aarch64 = llvm.aarch64.crypto.sm3partw1 +generate uint32x4_t + +/// SM3PARTW2 +name = vsm3partw2 +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +c = 1, 2, 3, 4 +validate 1, 2, 3, 4 +target = sm4 + +aarch64 = sm3partw2 +link-aarch64 = llvm.aarch64.crypto.sm3partw2 +generate uint32x4_t + +/// SM3SS1 +name = vsm3ss1 +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +c = 1, 2, 3, 4 +validate 1, 2, 3, 4 +target = sm4 + +aarch64 = sm3ss1 +link-aarch64 = llvm.aarch64.crypto.sm3ss1 +generate uint32x4_t + +/// SM4 key +name = vsm4ekey +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +validate 1, 2, 3, 4 +target = sm4 + +aarch64 = sm4ekey +link-aarch64 = llvm.aarch64.crypto.sm4ekey +generate uint32x4_t + +/// SM4 encode +name = vsm4e +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +validate 1, 2, 3, 4 +target = sm4 + +aarch64 = sm4e +link-aarch64 = llvm.aarch64.crypto.sm4e +generate uint32x4_t + +/// Rotate and exclusive OR +name = vrax1 +a = 1, 2 +b = 3, 4 +validate 5, 6 +target = sha3 + +aarch64 = rax1 +link-aarch64 = llvm.aarch64.crypto.rax1 +generate uint64x2_t + +/// SHA512 hash update part 1 +name = vsha512h +a = 1, 2 +b = 3, 4 +c = 5, 6 +validate 7, 8 +target = sha3 + +aarch64 = sha512h +link-aarch64 = 
llvm.aarch64.crypto.sha512h +generate uint64x2_t + +/// SHA512 hash update part 2 +name = vsha512h2 +a = 1, 2 +b = 3, 4 +c = 5, 6 +validate 7, 8 +target = sha3 + +aarch64 = sha512h2 +link-aarch64 = llvm.aarch64.crypto.sha512h2 +generate uint64x2_t + +/// SHA512 schedule update 0 +name = vsha512su0 +a = 1, 2 +b = 3, 4 +validate 7, 8 +target = sha3 + +aarch64 = sha512su0 +link-aarch64 = llvm.aarch64.crypto.sha512su0 +generate uint64x2_t + +/// SHA512 schedule update 1 +name = vsha512su1 +a = 1, 2 +b = 3, 4 +c = 5, 6 +validate 7, 8 +target = sha3 + +aarch64 = sha512su1 +link-aarch64 = llvm.aarch64.crypto.sha512su1 +generate uint64x2_t + +/// Floating-point round to 32-bit integer, using current rounding mode +name = vrnd32x +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 2.0, -2.0, -2.0 +target = v8.5a + +aarch64 = frint32x +link-aarch64 = frint32x._EXT_ +// v8.5a is no_runtime feature +//generate float32x2_t, float32x4_t + +/// Floating-point round to 32-bit integer toward zero +name = vrnd32z +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 1.0, -1.0, -2.0 +target = v8.5a + +aarch64 = frint32z +link-aarch64 = frint32z._EXT_ +//generate float32x2_t, float32x4_t + +/// Floating-point round to 64-bit integer, using current rounding mode +name = vrnd64x +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 2.0, -2.0, -2.0 +target = v8.5a + +aarch64 = frint64x +link-aarch64 = frint64x._EXT_ +//generate float32x2_t, float32x4_t + +/// Floating-point round to 64-bit integer toward zero +name = vrnd64z +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 1.0, -1.0, -2.0 +target = v8.5a + +aarch64 = frint64z +link-aarch64 = frint64z._EXT_ +//generate float32x2_t, float32x4_t + /// Transpose elements name = vtrn multi_fn = simd_shuffle-in_len-!, a1:in_t, a, b, {transpose-1-in_len} diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs index a33933ad97..aa516f73fe 100644 --- a/crates/stdarch-gen/src/main.rs +++ b/crates/stdarch-gen/src/main.rs @@ -464,7 +464,10 @@ enum TargetFeature { FCMA, Dotprod, I8MM, + SHA3, RDM, + SM4, + V85a, } #[derive(Clone, Copy)] @@ -1068,7 +1071,10 @@ fn gen_aarch64( FCMA => "neon,fcma", Dotprod => "neon,dotprod", I8MM => "neon,i8mm", + SHA3 => "neon,sha3", RDM => "rdm", + SM4 => "neon,sm4", + V85a => "neon,v8.5a", }; let current_fn = if let Some(current_fn) = current_fn.clone() { if link_aarch64.is_some() { @@ -1379,6 +1385,13 @@ fn gen_aarch64( fn_decl, call_params ); + let test_target = match target { + I8MM => "neon,i8mm", + SM4 => "neon,sm4", + SHA3 => "neon,sha3", + V85a => "neon,v8.5a", + _ => "neon", + }; let test = match fn_type { Fntype::Normal => gen_test( &name, @@ -1388,6 +1401,7 @@ fn gen_aarch64( [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])], type_len(out_t), para_num, + test_target, ), Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)), Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])), @@ -1575,12 +1589,13 @@ fn gen_test( len_in: [usize; 3], len_out: usize, para_num: i32, + target: &str, ) -> String { let mut test = format!( r#" - #[simd_test(enable = "neon")] + #[simd_test(enable = "{}")] unsafe fn test_{}() {{"#, - name, + target, name, ); for (a, b, c, n, e) in current_tests { let a: Vec = a.iter().take(len_in[0]).cloned().collect(); @@ -1777,7 +1792,10 @@ fn gen_arm( FCMA => "neon,fcma", Dotprod => "neon,dotprod", I8MM => "neon,i8mm", + SHA3 => "neon,sha3", RDM => "rdm", + SM4 => "neon,sm4", + V85a => "neon,v8.5a", }; let current_target_arm = match target { Default => "v7", @@ -1787,8 
+1805,11 @@ fn gen_arm( AES => "aes,v8", FCMA => "v8", // v8.3a Dotprod => "v8", // v8.2a - I8MM => "v8", // v8.6a + I8MM => "v8,i8mm", RDM => unreachable!(), + SM4 => unreachable!(), + SHA3 => unreachable!(), + V85a => unreachable!(), }; let current_fn = if let Some(current_fn) = current_fn.clone() { if link_aarch64.is_some() || link_arm.is_some() { @@ -2364,6 +2385,13 @@ fn gen_arm( call, ) }; + let test_target = match target { + I8MM => "neon,i8mm", + SM4 => "neon,sm4", + SHA3 => "neon,sha3", + V85a => "neon,v8.5a", + _ => "neon", + }; let test = match fn_type { Fntype::Normal => gen_test( &name, @@ -2373,6 +2401,7 @@ fn gen_arm( [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])], type_len(out_t), para_num, + test_target, ), Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)), Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])), @@ -3173,7 +3202,10 @@ mod test { "fcma" => FCMA, "dotprod" => Dotprod, "i8mm" => I8MM, + "sha3" => SHA3, "rdm" => RDM, + "sm4" => SM4, + "v8.5a" => V85a, _ => Default, }, _ => Default, @@ -3278,20 +3310,22 @@ mod test { tests_aarch64.push('}'); tests_aarch64.push('\n'); - let arm_out_path: PathBuf = PathBuf::from(env::var("OUT_DIR").unwrap()) - .join("src") - .join("arm_shared") - .join("neon"); + let arm_out_path: PathBuf = + PathBuf::from(env::var("OUT_DIR").unwrap_or("crates/core_arch".to_string())) + .join("src") + .join("arm_shared") + .join("neon"); std::fs::create_dir_all(&arm_out_path)?; let mut file_arm = File::create(arm_out_path.join(ARM_OUT))?; file_arm.write_all(out_arm.as_bytes())?; file_arm.write_all(tests_arm.as_bytes())?; - let aarch64_out_path: PathBuf = PathBuf::from(env::var("OUT_DIR").unwrap()) - .join("src") - .join("aarch64") - .join("neon"); + let aarch64_out_path: PathBuf = + PathBuf::from(env::var("OUT_DIR").unwrap_or("crates/core_arch".to_string())) + .join("src") + .join("aarch64") + .join("neon"); std::fs::create_dir_all(&aarch64_out_path)?; let mut file_aarch = File::create(aarch64_out_path.join(AARCH64_OUT))?; diff --git a/crates/stdarch-verify/tests/arm.rs b/crates/stdarch-verify/tests/arm.rs index bd894e0baa..ce7039ce72 100644 --- a/crates/stdarch-verify/tests/arm.rs +++ b/crates/stdarch-verify/tests/arm.rs @@ -559,6 +559,32 @@ fn verify_all_signatures() { "vaddq_p16", "vaddq_p64", "vaddq_p128", + "vsm4ekeyq_u32", + "vsm4eq_u32", + "vmmlaq_s32", + "vmmlaq_u32", + "vusmmlaq_s32", + "vsm3partw1q_u32", + "vsm3partw2q_u32", + "vsm3ss1q_u32", + "vsm3tt1aq_u32", + "vsm3tt1bq_u32", + "vsm3tt2aq_u32", + "vsm3tt2bq_u32", + "vrax1q_u64", + "vxarq_u64", + "vsha512hq_u64", + "vsha512h2q_u64", + "vsha512su0q_u64", + "vsha512su1q_u64", + "vrnd32x_f32", + "vrnd32xq_f32", + "vrnd32z_f32", + "vrnd32zq_f32", + "vrnd64x_f32", + "vrnd64xq_f32", + "vrnd64z_f32", + "vrnd64zq_f32", "__dbg", ]; let arm = match map.get(rust.name) { From 2fc5931f4881598f2c7d6cfc66704819b7b68721 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Tue, 9 Nov 2021 19:00:29 +0800 Subject: [PATCH 2/5] update runtime feature detect --- crates/std_detect/src/detect/arch/arm.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/std_detect/src/detect/arch/arm.rs b/crates/std_detect/src/detect/arch/arm.rs index d96514c844..9e7dda094f 100644 --- a/crates/std_detect/src/detect/arch/arm.rs +++ b/crates/std_detect/src/detect/arch/arm.rs @@ -22,4 +22,6 @@ features! 
{ /// FEAT_AES (AES instructions) @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] sha2: "sha2"; /// FEAT_SHA1 & FEAT_SHA256 (SHA1 & SHA2-256 instructions) + @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] i8mm: "i8mm"; + /// FEAT_I8MM } From d5ec8fad0fbc41b3f5be2cf8c262ce3e336540f2 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Tue, 9 Nov 2021 19:45:15 +0800 Subject: [PATCH 3/5] correct tests --- .../core_arch/src/aarch64/neon/generated.rs | 20 ++++++++-------- crates/core_arch/src/aarch64/neon/mod.rs | 12 +++++----- crates/core_arch/src/arm_shared/neon/mod.rs | 24 +++++++++---------- crates/stdarch-gen/neon.spec | 20 ++++++++-------- 4 files changed, 38 insertions(+), 38 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index efa637d1c3..abcf75f290 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -23056,7 +23056,7 @@ mod test { let a: u32x4 = u32x4::new(1, 2, 3, 4); let b: u32x4 = u32x4::new(1, 2, 3, 4); let c: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(2147549312, 3221323968, 131329, 2684362752); let r: u32x4 = transmute(vsm3partw1q_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -23066,7 +23066,7 @@ mod test { let a: u32x4 = u32x4::new(1, 2, 3, 4); let b: u32x4 = u32x4::new(1, 2, 3, 4); let c: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(128, 256, 384, 1077977696); let r: u32x4 = transmute(vsm3partw2q_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -23076,7 +23076,7 @@ mod test { let a: u32x4 = u32x4::new(1, 2, 3, 4); let b: u32x4 = u32x4::new(1, 2, 3, 4); let c: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(0, 0, 0, 2098176); let r: u32x4 = transmute(vsm3ss1q_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -23085,7 +23085,7 @@ mod test { unsafe fn test_vsm4ekeyq_u32() { let a: u32x4 = u32x4::new(1, 2, 3, 4); let b: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1784948604, 136020997, 2940231695, 3789947679); let r: u32x4 = transmute(vsm4ekeyq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } @@ -23094,7 +23094,7 @@ mod test { unsafe fn test_vsm4eq_u32() { let a: u32x4 = u32x4::new(1, 2, 3, 4); let b: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1093874472, 3616769504, 3878330411, 2765298765); let r: u32x4 = transmute(vsm4eq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } @@ -23103,7 +23103,7 @@ mod test { unsafe fn test_vrax1q_u64() { let a: u64x2 = u64x2::new(1, 2); let b: u64x2 = u64x2::new(3, 4); - let e: u64x2 = u64x2::new(5, 6); + let e: u64x2 = u64x2::new(7, 10); let r: u64x2 = transmute(vrax1q_u64(transmute(a), transmute(b))); assert_eq!(r, e); } @@ -23113,7 +23113,7 @@ mod test { let a: u64x2 = u64x2::new(1, 2); let b: u64x2 = u64x2::new(3, 4); let c: u64x2 = u64x2::new(5, 6); - let e: u64x2 = u64x2::new(7, 8); + let e: u64x2 = u64x2::new(11189044327219203, 7177611956453380); let r: u64x2 = transmute(vsha512hq_u64(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -23123,7 +23123,7 @@ mod test { let a: u64x2 = u64x2::new(1, 2); let b: u64x2 = u64x2::new(3, 4); let c: u64x2 = u64x2::new(5, 6); - let e: u64x2 = u64x2::new(7, 8); + let e: u64x2 = 
u64x2::new(5770237651009406214, 349133864969); let r: u64x2 = transmute(vsha512h2q_u64(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -23132,7 +23132,7 @@ mod test { unsafe fn test_vsha512su0q_u64() { let a: u64x2 = u64x2::new(1, 2); let b: u64x2 = u64x2::new(3, 4); - let e: u64x2 = u64x2::new(7, 8); + let e: u64x2 = u64x2::new(144115188075855874, 9439544818968559619); let r: u64x2 = transmute(vsha512su0q_u64(transmute(a), transmute(b))); assert_eq!(r, e); } @@ -23142,7 +23142,7 @@ mod test { let a: u64x2 = u64x2::new(1, 2); let b: u64x2 = u64x2::new(3, 4); let c: u64x2 = u64x2::new(5, 6); - let e: u64x2 = u64x2::new(7, 8); + let e: u64x2 = u64x2::new(105553116266526, 140737488355368); let r: u64x2 = transmute(vsha512su1q_u64(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index 06044e8569..d23e43c435 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -4963,7 +4963,7 @@ mod tests { let a: u32x4 = u32x4::new(1, 2, 3, 4); let b: u32x4 = u32x4::new(1, 2, 3, 4); let c: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(2, 1536, 4, 16395); let r: u32x4 = transmute(vsm3tt1aq_u32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -4973,7 +4973,7 @@ mod tests { let a: u32x4 = u32x4::new(1, 2, 3, 4); let b: u32x4 = u32x4::new(1, 2, 3, 4); let c: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(2, 1536, 4, 16392); let r: u32x4 = transmute(vsm3tt1bq_u32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -4983,7 +4983,7 @@ mod tests { let a: u32x4 = u32x4::new(1, 2, 3, 4); let b: u32x4 = u32x4::new(1, 2, 3, 4); let c: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(2, 1572864, 4, 1447435); let r: u32x4 = transmute(vsm3tt2aq_u32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -4993,7 +4993,7 @@ mod tests { let a: u32x4 = u32x4::new(1, 2, 3, 4); let b: u32x4 = u32x4::new(1, 2, 3, 4); let c: u32x4 = u32x4::new(1, 2, 3, 4); - let e: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(2, 1572864, 4, 1052680); let r: u32x4 = transmute(vsm3tt2bq_u32::<0>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -5001,8 +5001,8 @@ mod tests { #[simd_test(enable = "neon,sha3")] unsafe fn test_vxarq_u64() { let a: u64x2 = u64x2::new(1, 2); - let b: u64x2 = u64x2::new(1, 2); - let e: u64x2 = u64x2::new(5, 6); + let b: u64x2 = u64x2::new(3, 4); + let e: u64x2 = u64x2::new(2, 6); let r: u64x2 = transmute(vxarq_u64::<0>(transmute(a), transmute(b))); assert_eq!(r, e); } diff --git a/crates/core_arch/src/arm_shared/neon/mod.rs b/crates/core_arch/src/arm_shared/neon/mod.rs index b11df45919..15c659ded9 100644 --- a/crates/core_arch/src/arm_shared/neon/mod.rs +++ b/crates/core_arch/src/arm_shared/neon/mod.rs @@ -4810,7 +4810,7 @@ pub unsafe fn vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { #[inline] #[target_feature(enable = "neon,i8mm")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsmmla))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smmla))] pub unsafe fn vmmlaq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { 
#[allow(improper_ctypes)] @@ -4829,7 +4829,7 @@ pub unsafe fn vmmlaq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t #[inline] #[target_feature(enable = "neon,i8mm")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vummla))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ummla))] pub unsafe fn vmmlaq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { #[allow(improper_ctypes)] @@ -4848,7 +4848,7 @@ pub unsafe fn vmmlaq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x #[inline] #[target_feature(enable = "neon,i8mm")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vusmmla))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usmmla))] pub unsafe fn vusmmlaq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { #[allow(improper_ctypes)] @@ -10427,9 +10427,9 @@ mod tests { } #[simd_test(enable = "neon,i8mm")] unsafe fn test_vmmlaq_s32() { - let a: i32x4 = i32x4::new(1, 2, 3, 4); - let b: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let c: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let a: i32x4 = i32x4::new(1, 3, 4, 9); + let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16); + let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16); let e: i32x4 = i32x4::new(1, 2, 3, 4); let r: i32x4 = transmute(vmmlaq_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); @@ -10437,9 +10437,9 @@ mod tests { #[simd_test(enable = "neon,i8mm")] unsafe fn test_vmmlaq_u32() { - let a: u32x4 = u32x4::new(1, 2, 3, 4); - let b: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let c: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let a: u32x4 = u32x4::new(1, 3, 4, 9); + let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16); + let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16); let e: u32x4 = u32x4::new(1, 2, 3, 4); let r: u32x4 = transmute(vmmlaq_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); @@ -10447,9 +10447,9 @@ mod tests { #[simd_test(enable = "neon,i8mm")] unsafe fn test_vusmmlaq_s32() { - let a: i32x4 = i32x4::new(1, 2, 3, 4); - let b: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let c: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + let a: i32x4 = i32x4::new(1, 3, 4, 9); + let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16); + let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16); let e: i32x4 = i32x4::new(1, 2, 3, 4); let r: i32x4 = transmute(vusmmlaq_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index a034873504..be5ea6eb85 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -6851,7 +6851,7 @@ name = vsm3partw1 a = 1, 2, 3, 4 b = 1, 2, 3, 4 c = 1, 2, 3, 4 -validate 1, 2, 3, 4 +validate 2147549312, 3221323968, 131329, 2684362752 target = sm4 aarch64 = sm3partw1 @@ -6863,7 +6863,7 @@ name = vsm3partw2 a = 1, 2, 3, 4 b = 1, 2, 3, 4 c = 1, 2, 3, 4 -validate 1, 2, 3, 4 +validate 128, 256, 384, 1077977696 
 target = sm4
 
 aarch64 = sm3partw2
@@ -6875,7 +6875,7 @@ name = vsm3ss1
 a = 1, 2, 3, 4
 b = 1, 2, 3, 4
 c = 1, 2, 3, 4
-validate 1, 2, 3, 4
+validate 0, 0, 0, 2098176
 target = sm4
 
 aarch64 = sm3ss1
@@ -6886,7 +6886,7 @@ generate uint32x4_t
 name = vsm4ekey
 a = 1, 2, 3, 4
 b = 1, 2, 3, 4
-validate 1, 2, 3, 4
+validate 1784948604, 136020997, 2940231695, 3789947679
 target = sm4
 
 aarch64 = sm4ekey
@@ -6897,7 +6897,7 @@ generate uint32x4_t
 name = vsm4e
 a = 1, 2, 3, 4
 b = 1, 2, 3, 4
-validate 1, 2, 3, 4
+validate 1093874472, 3616769504, 3878330411, 2765298765
 target = sm4
 
 aarch64 = sm4e
@@ -6908,7 +6908,7 @@ generate uint32x4_t
 name = vrax1
 a = 1, 2
 b = 3, 4
-validate 5, 6
+validate 7, 10
 target = sha3
 
 aarch64 = rax1
@@ -6920,7 +6920,7 @@ name = vsha512h
 a = 1, 2
 b = 3, 4
 c = 5, 6
-validate 7, 8
+validate 11189044327219203, 7177611956453380
 target = sha3
 
 aarch64 = sha512h
@@ -6932,7 +6932,7 @@ name = vsha512h2
 a = 1, 2
 b = 3, 4
 c = 5, 6
-validate 7, 8
+validate 5770237651009406214, 349133864969
 target = sha3
 
 aarch64 = sha512h2
@@ -6943,7 +6943,7 @@ generate uint64x2_t
 name = vsha512su0
 a = 1, 2
 b = 3, 4
-validate 7, 8
+validate 144115188075855874, 9439544818968559619
 target = sha3
 
 aarch64 = sha512su0
@@ -6955,7 +6955,7 @@ name = vsha512su1
 a = 1, 2
 b = 3, 4
 c = 5, 6
-validate 7, 8
+validate 105553116266526, 140737488355368
 target = sha3
 
 aarch64 = sha512su1

From 01f3bc736c0c4e715920175ba70bd5af44e8bd23 Mon Sep 17 00:00:00 2001
From: SparrowLii
Date: Wed, 10 Nov 2021 15:35:35 +0800
Subject: [PATCH 4/5] add `vrnd32x` `vrnd64x`

---
 .../core_arch/src/aarch64/neon/generated.rs | 168 ++++++++++++++++++
 crates/stdarch-gen/neon.spec                |  17 +-
 crates/stdarch-gen/src/main.rs              |  14 +-
 3 files changed, 183 insertions(+), 16 deletions(-)

diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index abcf75f290..bdf6158bb2 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -12096,6 +12096,110 @@ pub unsafe fn vsha512su1q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> ui
     vsha512su1q_u64_(a, b, c)
 }
 
+/// Floating-point round to 32-bit integer, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint32x))]
+pub unsafe fn vrnd32x_f32(a: float32x2_t) -> float32x2_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32x.v2f32")]
+        fn vrnd32x_f32_(a: float32x2_t) -> float32x2_t;
+    }
+    vrnd32x_f32_(a)
+}
+
+/// Floating-point round to 32-bit integer, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint32x))]
+pub unsafe fn vrnd32xq_f32(a: float32x4_t) -> float32x4_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32x.v4f32")]
+        fn vrnd32xq_f32_(a: float32x4_t) -> float32x4_t;
+    }
+    vrnd32xq_f32_(a)
+}
+
+/// Floating-point round to 32-bit integer toward zero
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint32z))]
+pub unsafe fn vrnd32z_f32(a: float32x2_t) -> float32x2_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32z.v2f32")]
+        fn vrnd32z_f32_(a: float32x2_t) -> float32x2_t;
+    }
+    vrnd32z_f32_(a)
+}
+
+/// Floating-point round to 32-bit integer toward zero
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint32z))]
+pub unsafe fn vrnd32zq_f32(a: float32x4_t) -> float32x4_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32z.v4f32")]
+        fn vrnd32zq_f32_(a: float32x4_t) -> float32x4_t;
+    }
+    vrnd32zq_f32_(a)
+}
+
+/// Floating-point round to 64-bit integer, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint64x))]
+pub unsafe fn vrnd64x_f32(a: float32x2_t) -> float32x2_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64x.v2f32")]
+        fn vrnd64x_f32_(a: float32x2_t) -> float32x2_t;
+    }
+    vrnd64x_f32_(a)
+}
+
+/// Floating-point round to 64-bit integer, using current rounding mode
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint64x))]
+pub unsafe fn vrnd64xq_f32(a: float32x4_t) -> float32x4_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64x.v4f32")]
+        fn vrnd64xq_f32_(a: float32x4_t) -> float32x4_t;
+    }
+    vrnd64xq_f32_(a)
+}
+
+/// Floating-point round to 64-bit integer toward zero
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint64z))]
+pub unsafe fn vrnd64z_f32(a: float32x2_t) -> float32x2_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64z.v2f32")]
+        fn vrnd64z_f32_(a: float32x2_t) -> float32x2_t;
+    }
+    vrnd64z_f32_(a)
+}
+
+/// Floating-point round to 64-bit integer toward zero
+#[inline]
+#[target_feature(enable = "neon,frintts")]
+#[cfg_attr(test, assert_instr(frint64z))]
+pub unsafe fn vrnd64zq_f32(a: float32x4_t) -> float32x4_t {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64z.v4f32")]
+        fn vrnd64zq_f32_(a: float32x4_t) -> float32x4_t;
+    }
+    vrnd64zq_f32_(a)
+}
+
 /// Transpose vectors
 #[inline]
 #[target_feature(enable = "neon")]
@@ -23147,6 +23251,70 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon,frintts")]
+    unsafe fn test_vrnd32x_f32() {
+        let a: f32x2 = f32x2::new(1.1, 1.9);
+        let e: f32x2 = f32x2::new(1.0, 2.0);
+        let r: f32x2 = transmute(vrnd32x_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,frintts")]
+    unsafe fn test_vrnd32xq_f32() {
+        let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3);
+        let e: f32x4 = f32x4::new(1.0, 2.0, -2.0, -2.0);
+        let r: f32x4 = transmute(vrnd32xq_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,frintts")]
+    unsafe fn test_vrnd32z_f32() {
+        let a: f32x2 = f32x2::new(1.1, 1.9);
+        let e: f32x2 = f32x2::new(1.0, 1.0);
+        let r: f32x2 = transmute(vrnd32z_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,frintts")]
+    unsafe fn test_vrnd32zq_f32() {
+        let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3);
+        let e: f32x4 = f32x4::new(1.0, 1.0, -1.0, -2.0);
+        let r: f32x4 = transmute(vrnd32zq_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,frintts")]
+    unsafe fn test_vrnd64x_f32() {
+        let a: f32x2 = f32x2::new(1.1, 1.9);
+        let e: f32x2 = f32x2::new(1.0, 2.0);
+        let r: f32x2 = transmute(vrnd64x_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,frintts")]
+    unsafe fn test_vrnd64xq_f32() {
+        let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3);
+        let e: f32x4 = f32x4::new(1.0, 2.0, -2.0, -2.0);
+        let r: f32x4 = transmute(vrnd64xq_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,frintts")]
+    unsafe fn test_vrnd64z_f32() {
+        let a: f32x2 = f32x2::new(1.1, 1.9);
+        let e: f32x2 = f32x2::new(1.0, 1.0);
+        let r: f32x2 = transmute(vrnd64z_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon,frintts")]
+    unsafe fn test_vrnd64zq_f32() {
+        let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3);
+        let e: f32x4 = f32x4::new(1.0, 1.0, -1.0, -2.0);
+        let r: f32x4 = transmute(vrnd64zq_f32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vtrn1_s8() {
         let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14);
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
index be5ea6eb85..933d2bc41f 100644
--- a/crates/stdarch-gen/neon.spec
+++ b/crates/stdarch-gen/neon.spec
@@ -6966,42 +6966,41 @@ generate uint64x2_t
 name = vrnd32x
 a = 1.1, 1.9, -1.7, -2.3
 validate 1.0, 2.0, -2.0, -2.0
-target = v8.5a
+target = frintts
 
 aarch64 = frint32x
 link-aarch64 = frint32x._EXT_
-// v8.5a is no_runtime feature
-//generate float32x2_t, float32x4_t
+generate float32x2_t, float32x4_t
 
 /// Floating-point round to 32-bit integer toward zero
 name = vrnd32z
 a = 1.1, 1.9, -1.7, -2.3
 validate 1.0, 1.0, -1.0, -2.0
-target = v8.5a
+target = frintts
 
 aarch64 = frint32z
 link-aarch64 = frint32z._EXT_
-//generate float32x2_t, float32x4_t
+generate float32x2_t, float32x4_t
 
 /// Floating-point round to 64-bit integer, using current rounding mode
 name = vrnd64x
 a = 1.1, 1.9, -1.7, -2.3
 validate 1.0, 2.0, -2.0, -2.0
-target = v8.5a
+target = frintts
 
 aarch64 = frint64x
 link-aarch64 = frint64x._EXT_
-//generate float32x2_t, float32x4_t
+generate float32x2_t, float32x4_t
 
 /// Floating-point round to 64-bit integer toward zero
 name = vrnd64z
 a = 1.1, 1.9, -1.7, -2.3
 validate 1.0, 1.0, -1.0, -2.0
-target = v8.5a
+target = frintts
 
 aarch64 = frint64z
 link-aarch64 = frint64z._EXT_
-//generate float32x2_t, float32x4_t
+generate float32x2_t, float32x4_t
 
 /// Transpose elements
 name = vtrn
diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs
index aa516f73fe..4ef3cb091d 100644
--- a/crates/stdarch-gen/src/main.rs
+++ b/crates/stdarch-gen/src/main.rs
@@ -467,7 +467,7 @@ enum TargetFeature {
     SHA3,
     RDM,
     SM4,
-    V85a,
+    FTTS,
 }
 
 #[derive(Clone, Copy)]
@@ -1074,7 +1074,7 @@ fn gen_aarch64(
         SHA3 => "neon,sha3",
         RDM => "rdm",
         SM4 => "neon,sm4",
-        V85a => "neon,v8.5a",
+        FTTS => "neon,frintts",
     };
     let current_fn = if let Some(current_fn) = current_fn.clone() {
         if link_aarch64.is_some() {
@@ -1389,7 +1389,7 @@ fn gen_aarch64(
         I8MM => "neon,i8mm",
         SM4 => "neon,sm4",
         SHA3 => "neon,sha3",
-        V85a => "neon,v8.5a",
+        FTTS => "neon,frintts",
         _ => "neon",
     };
     let test = match fn_type {
@@ -1795,7 +1795,7 @@ fn gen_arm(
         SHA3 => "neon,sha3",
         RDM => "rdm",
         SM4 => "neon,sm4",
-        V85a => "neon,v8.5a",
+        FTTS => "neon,frintts",
     };
     let current_target_arm = match target {
         Default => "v7",
@@ -1809,7 +1809,7 @@ fn gen_arm(
         RDM => unreachable!(),
         SM4 => unreachable!(),
         SHA3 => unreachable!(),
-        V85a => unreachable!(),
+        FTTS => unreachable!(),
     };
     let current_fn = if let Some(current_fn) = current_fn.clone() {
         if link_aarch64.is_some() || link_arm.is_some() {
@@ -2389,7 +2389,7 @@ fn gen_arm(
         I8MM => "neon,i8mm",
         SM4 => "neon,sm4",
         SHA3 => "neon,sha3",
-        V85a => "neon,v8.5a",
+        FTTS => "neon,frintts",
         _ => "neon",
     };
     let test = match fn_type {
@@ -3205,7 +3205,7 @@ mod test {
                         "sha3" => SHA3,
                         "rdm" => RDM,
                         "sm4" => SM4,
-                        "v8.5a" => V85a,
+                        "frintts" => FTTS,
                         _ => Default,
                     },
                     _ => Default,

From bc71db9dc330622f0091cc11594f542dd249672c Mon Sep 17 00:00:00 2001
From: SparrowLii
Date: Wed, 10 Nov 2021 18:34:45 +0800
Subject: [PATCH 5/5] add MISSING.md

---
 crates/core_arch/MISSING.md | 194 ++++++++++++++++++++++++++++++++++++
 1 file changed, 194 insertions(+)
 create mode 100644 crates/core_arch/MISSING.md

diff --git a/crates/core_arch/MISSING.md b/crates/core_arch/MISSING.md
new file mode 100644
index 0000000000..99eb794a55
--- /dev/null
+++ b/crates/core_arch/MISSING.md
@@ -0,0 +1,194 @@
+## The following neon intrinsics are currently not implemented in stdarch
+
+### Can be implemented next:
+
+`vcls_u16`
+
+`vcls_u32`
+
+`vcls_u8`
+
+`vclsq_u16`
+
+`vclsq_u32`
+
+`vclsq_u8`
+
+`vcreate_s16`
+
+`vcreate_u16`
+
+`vpaddq_s64`
+
+`vpaddq_u64`
+
+`vreinterpretq_p128_f32`
+
+`vreinterpretq_p128_f64`
+
+`vreinterpretq_p128_p16`
+
+`vreinterpretq_p128_p8`
+
+`vreinterpretq_p128_s16`
+
+`vreinterpretq_p128_s32`
+
+`vreinterpretq_p128_s64`
+
+`vreinterpretq_p128_s8`
+
+`vreinterpretq_p128_u16`
+
+`vreinterpretq_p128_u32`
+
+`vreinterpretq_p128_u64`
+
+`vreinterpretq_p128_u8`
+
+`vslid_n_s64`
+
+`vslid_n_u64`
+
+`vsrid_n_s64`
+
+`vsrid_n_u64`
+
+### Not implemented on arm:
+
+`vcadd_rot270_f32`
+
+`vcadd_rot90_f32`
+
+`vcaddq_rot270_f32`
+
+`vcaddq_rot90_f32`
+
+`vdot_s32`
+
+`vdot_u32`
+
+`vdotq_s32`
+
+`vdotq_u32`
+
+`vdot_lane_s32`
+
+`vdot_lane_u32`
+
+`vdotq_lane_s32`
+
+`vdotq_lane_u32`
+
+`vcmla_f32`
+
+`vcmla_lane_f32`
+
+`vcmla_laneq_f32`
+
+`vcmla_rot180_f32`
+
+`vcmla_rot180_lane_f32`
+
+`vcmla_rot180_laneq_f32`
+
+`vcmla_rot270_f32`
+
+`vcmla_rot270_lane_f32`
+
+`vcmla_rot270_laneq_f32`
+
+`vcmla_rot90_f32`
+
+`vcmla_rot90_lane_f32`
+
+`vcmla_rot90_laneq_f32`
+
+`vcmlaq_f32`
+
+`vcmlaq_lane_f32`
+
+`vcmlaq_laneq_f32`
+
+`vcmlaq_rot180_f32`
+
+`vcmlaq_rot180_lane_f32`
+
+`vcmlaq_rot180_laneq_f32`
+
+`vcmlaq_rot270_f32`
+
+`vcmlaq_rot270_lane_f32`
+
+`vcmlaq_rot270_laneq_f32`
+
+`vcmlaq_rot90_f32`
+
+`vcmlaq_rot90_lane_f32`
+
+`vcmlaq_rot90_laneq_f32`
+
+### Not implemented in LLVM:
+
+`vrnd32x_f64`
+
+`vrnd32xq_f64`
+
+`vrnd32z_f64`
+
+`vrnd32zq_f64`
+
+`vrnd64x_f64`
+
+`vrnd64xq_f64`
+
+`vrnd64z_f64`
+
+`vrnd64zq_f64`
+
+### LLVM Select errors may occur:
+
+`vsudot_lane_s32`
+
+`vsudot_laneq_s32`
+
+`vsudotq_lane_s32`
+
+`vsudotq_laneq_s32`
+
+`vusdot_lane_s32`
+
+`vusdot_laneq_s32`
+
+`vusdot_s32`
+
+`vusdotq_lane_s32`
+
+`vusdotq_laneq_s32`
+
+`vusdotq_s32`
+
+`vqshlu_n_s16`
+
+`vqshlu_n_s32`
+
+`vqshlu_n_s64`
+
+`vqshlu_n_s8`
+
+`vqshlub_n_s8`
+
+`vqshlud_n_s64`
+
+`vqshluh_n_s16`
+
+`vqshluq_n_s16`
+
+`vqshluq_n_s32`
+
+`vqshluq_n_s64`
+
+`vqshluq_n_s8`
+
+`vqshlus_n_s32`
+
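As a usage sketch (not part of the patch series itself), the intrinsics generated above could be driven like this once the series lands. It assumes a nightly toolchain with the `stdsimd` feature gate and that `std_detect` exposes the `frintts` and `sha3` runtime features under those names; treat the gate and feature strings as assumptions, not guarantees.

```rust
// Hypothetical demo of the new frint32x/eor3 intrinsics added in this series.
// Assumes: nightly Rust, the `stdsimd` feature gate, and an aarch64 CPU that
// implements FEAT_FRINTTS and FEAT_SHA3.
#![feature(stdsimd)]

#[cfg(target_arch = "aarch64")]
fn demo() {
    use std::arch::aarch64::*;

    if std::arch::is_aarch64_feature_detected!("frintts") {
        unsafe {
            // FRINT32X: round each lane to the nearest value representable
            // as a 32-bit integer, using the current rounding mode.
            let v = vdupq_n_f32(1.9);
            let r = vrnd32xq_f32(v);
            assert_eq!(vgetq_lane_f32::<0>(r), 2.0);
        }
    }

    if std::arch::is_aarch64_feature_detected!("sha3") {
        unsafe {
            // EOR3: a ^ b ^ c computed by a single instruction.
            let a = vdupq_n_u8(0b1010);
            let b = vdupq_n_u8(0b0110);
            let c = vdupq_n_u8(0b0001);
            let r = veor3q_u8(a, b, c);
            assert_eq!(vgetq_lane_u8::<0>(r), 0b1101);
        }
    }
}

#[cfg(not(target_arch = "aarch64"))]
fn demo() {}

fn main() {
    demo();
}
```

The runtime checks mirror the `target_feature(enable = "neon,frintts")` and `neon,sha3` gates on the generated functions: calling either intrinsic on a CPU without the corresponding feature is undefined behaviour, so the detection guard is what makes the unsafe calls sound.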