From 4de58bf05e71e02ed3d1c790657d78b10b5f2d8d Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Wed, 17 May 2023 11:28:21 -0700 Subject: [PATCH] [mono][jit] Implement JIT support for the arm64 Crc and Dp intrinsics sets. (#86106) Also implement hw capability detection for apple+arm64 platforms. --- src/mono/mono/arch/arm64/arm64-codegen.h | 14 ++++ src/mono/mono/arch/arm64/codegen-test.c | 10 +++ src/mono/mono/mini/cpu-arm64.mdesc | 4 ++ src/mono/mono/mini/mini-arm64.c | 81 +++++++++++++++++++++++- src/mono/mono/mini/mini-ops.h | 2 + src/mono/mono/mini/mini.c | 8 ++- src/mono/mono/mini/simd-intrinsics.c | 24 +++++-- src/mono/mono/utils/mono-hwcap-arm64.c | 54 ++++++++++++++++ src/mono/mono/utils/mono-hwcap-vars.h | 7 ++ 9 files changed, 196 insertions(+), 8 deletions(-) diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h index 0a69031867854..154fd10d85f63 100644 --- a/src/mono/mono/arch/arm64/arm64-codegen.h +++ b/src/mono/mono/arch/arm64/arm64-codegen.h @@ -950,6 +950,20 @@ arm_encode_arith_imm (int imm, guint32 *shift) #define arm_autibsp(p) arm_format_autib ((p), 0b0011, 0b111) +/* CRC32 */ + +#define arm_format_crc32(p, sf, C, sz, rm, rn, rd) arm_emit ((p), ((sf) << 31) | (0b11010110 << 21) | (rm) << 16 | (0b010 << 13) | ((C) << 12) | ((sz) << 10) | ((rn) << 5) | ((rd) << 0)) + +#define arm_crc32b(p, rd, rn, rm) arm_format_crc32 ((p), 0, 0, 0b00, (rm), (rn), (rd)) +#define arm_crc32h(p, rd, rn, rm) arm_format_crc32 ((p), 0, 0, 0b01, (rm), (rn), (rd)) +#define arm_crc32w(p, rd, rn, rm) arm_format_crc32 ((p), 0, 0, 0b10, (rm), (rn), (rd)) +#define arm_crc32x(p, rd, rn, rm) arm_format_crc32 ((p), 1, 0, 0b11, (rm), (rn), (rd)) + +#define arm_crc32cb(p, rd, rn, rm) arm_format_crc32 ((p), 0, 1, 0b00, (rm), (rn), (rd)) +#define arm_crc32ch(p, rd, rn, rm) arm_format_crc32 ((p), 0, 1, 0b01, (rm), (rn), (rd)) +#define arm_crc32cw(p, rd, rn, rm) arm_format_crc32 ((p), 0, 1, 0b10, (rm), (rn), (rd)) +#define arm_crc32cx(p, rd, rn, 
rm) arm_format_crc32 ((p), 1, 1, 0b11, (rm), (rn), (rd)) + /* C4.1.69 NEON vector ISA */ // Opcode naming convention is arm_neon__[_] diff --git a/src/mono/mono/arch/arm64/codegen-test.c b/src/mono/mono/arch/arm64/codegen-test.c index b50a6432cf252..7644708aeb2b8 100644 --- a/src/mono/mono/arch/arm64/codegen-test.c +++ b/src/mono/mono/arch/arm64/codegen-test.c @@ -482,6 +482,16 @@ main (int argc, char *argv []) arm_neon_addp (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2); arm_neon_faddp (code, VREG_FULL, TYPE_F32, ARMREG_R0, ARMREG_R1, ARMREG_R2); + // crc32 + arm_crc32b (code, ARMREG_R1, ARMREG_R2, ARMREG_R3); + arm_crc32h (code, ARMREG_R1, ARMREG_R2, ARMREG_R3); + arm_crc32w (code, ARMREG_R1, ARMREG_R2, ARMREG_R3); + arm_crc32x (code, ARMREG_R1, ARMREG_R2, ARMREG_R3); + arm_crc32cb (code, ARMREG_R1, ARMREG_R2, ARMREG_R3); + arm_crc32ch (code, ARMREG_R1, ARMREG_R2, ARMREG_R3); + arm_crc32cw (code, ARMREG_R1, ARMREG_R2, ARMREG_R3); + arm_crc32cx (code, ARMREG_R1, ARMREG_R2, ARMREG_R3); + for (i = 0; i < code - buf; ++i) printf (".byte %d\n", buf [i]); printf ("\n"); diff --git a/src/mono/mono/mini/cpu-arm64.mdesc b/src/mono/mono/mini/cpu-arm64.mdesc index 5d582da5a1bc8..57e31d0561eff 100644 --- a/src/mono/mono/mini/cpu-arm64.mdesc +++ b/src/mono/mono/mini/cpu-arm64.mdesc @@ -473,6 +473,9 @@ lscnt32: dest:i src1:i len:4 lscnt64: dest:i src1:i len:4 xop_i8_i8: dest:i src1:i len:4 xop_i4_i4: dest:i src1:i len:4 +xop_i4_i4_i4: dest:i src1:i src2:i len:4 +xop_i4_i4_i8: dest:i src1:i src2:i len:4 +xop_ovr_x_x_x_x: dest:x src1:x src2:x src3:x len:4 clob:1 arm64_smulh: dest:i src1:i src2:i len:4 arm64_umulh: dest:i src1:i src2:i len:4 arm64_hint: len:4 @@ -554,6 +557,7 @@ arm64_ushl: dest:x src1:x src2:x len:4 arm64_ext_imm: dest:x src1:x src2:x len:4 xinsert_i8: dest:x src1:x src2:i src3:i len:20 xinsert_r8: dest:x src1:x src2:f src3:i len:20 +arm64_broadcast_elem: dest:x src1:x len:16 generic_class_init: src1:a len:44 clob:c gc_safe_point: src1:i len:12 
clob:c diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index f163a76f50508..c31af98b63b0d 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include "llvm-intrinsics-types.h" @@ -3835,6 +3836,28 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } break; } + case OP_XOP_OVR_X_X_X_X: { + IntrinsicId iid = (IntrinsicId) ins->inst_c0; + g_assert (dreg == sreg1); + g_assert (mono_class_value_size (ins->klass, NULL) == 16); + switch (iid) { + case INTRINS_AARCH64_ADV_SIMD_SDOT: + arm_neon_sdot_4s (code, dreg, sreg2, sreg3); + break; + case INTRINS_AARCH64_ADV_SIMD_UDOT: + arm_neon_udot_4s (code, dreg, sreg2, sreg3); + break; + default: + g_assert_not_reached (); + break; + } + break; + } + case OP_ARM64_BROADCAST_ELEM: + arm_neon_smov (code, TYPE_I32, ARMREG_IP0, sreg1, ins->inst_c0); + arm_neon_dup_g_4s (code, dreg, ARMREG_IP0); + break; + case OP_XZERO: arm_neon_eor_16b (code, dreg, dreg, dreg); break; @@ -5383,7 +5406,46 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) g_assert (ins->inst_c0 == INTRINS_BITREVERSE_I32); arm_rbitw (code, dreg, sreg1); break; - + case OP_XOP_I4_I4_I4: { + switch (ins->inst_c0) { + case INTRINS_AARCH64_CRC32B: + arm_crc32b (code, dreg, sreg1, sreg2); + break; + case INTRINS_AARCH64_CRC32H: + arm_crc32h (code, dreg, sreg1, sreg2); + break; + case INTRINS_AARCH64_CRC32W: + arm_crc32w (code, dreg, sreg1, sreg2); + break; + case INTRINS_AARCH64_CRC32CB: + arm_crc32cb (code, dreg, sreg1, sreg2); + break; + case INTRINS_AARCH64_CRC32CH: + arm_crc32ch (code, dreg, sreg1, sreg2); + break; + case INTRINS_AARCH64_CRC32CW: + arm_crc32cw (code, dreg, sreg1, sreg2); + break; + default: + g_assert_not_reached (); + break; + } + break; + } + case OP_XOP_I4_I4_I8: { + switch (ins->inst_c0) { + case INTRINS_AARCH64_CRC32X: + arm_crc32x (code, dreg, sreg1, sreg2); + break; + case 
INTRINS_AARCH64_CRC32CX: + arm_crc32cx (code, dreg, sreg1, sreg2); + break; + default: + g_assert_not_reached (); + break; + } + break; + } case OP_ARM64_HINT: g_assert (ins->inst_c0 <= ARMHINT_SEVL); arm_hint (code, ins->inst_c0); @@ -6382,3 +6444,20 @@ mono_arm_emit_brx (guint8 *code, int reg) { return emit_brx (code, reg); } + +MonoCPUFeatures +mono_arch_get_cpu_features (void) +{ + guint64 features = MONO_CPU_INITED; + + if (mono_hwcap_arm64_has_crc32) + features |= MONO_CPU_ARM64_CRC; + if (mono_hwcap_arm64_has_dot) + features |= MONO_CPU_ARM64_DP; + if (mono_hwcap_arm64_has_rdm) + features |= MONO_CPU_ARM64_RDM; + if (mono_hwcap_arm64_has_sha1 && mono_hwcap_arm64_has_sha256 && mono_hwcap_arm64_has_aes) + features |= MONO_CPU_ARM64_CRYPTO; + + return features; +} diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index df1731f0e6801..120b884495bdc 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1722,6 +1722,8 @@ MINI_OP(OP_ARM64_SQXTUN2, "arm64_sqxtun2", XREG, XREG, XREG) MINI_OP(OP_ARM64_SELECT_SCALAR, "arm64_select_scalar", XREG, XREG, IREG) MINI_OP(OP_ARM64_SELECT_QUAD, "arm64_select_quad", XREG, XREG, IREG) +/* Take a word elem of sreg1 identified by inst_c0 and broadcast it to all elements of dreg */ +MINI_OP(OP_ARM64_BROADCAST_ELEM, "arm64_broadcast_elem", XREG, XREG, NONE) MINI_OP(OP_ARM64_FCVTN, "arm64_fcvtn", XREG, XREG, NONE) MINI_OP(OP_ARM64_FCVTN2, "arm64_fcvtn2", XREG, XREG, XREG) diff --git a/src/mono/mono/mini/mini.c b/src/mono/mono/mini/mini.c index 89f6ebdb46701..95d121f522ebe 100644 --- a/src/mono/mono/mini/mini.c +++ b/src/mono/mono/mini/mini.c @@ -4455,9 +4455,11 @@ mini_get_cpu_features (MonoCompile* cfg) #if !defined(MONO_CROSS_COMPILE) if (!cfg->compile_aot || cfg->use_current_cpu) { // detect current CPU features if we are in JIT mode or AOT with use_current_cpu flag. 
-#if defined(ENABLE_LLVM) - features = mono_llvm_get_cpu_features (); // llvm has a nice built-in API to detect features -#elif defined(TARGET_AMD64) || defined(TARGET_X86) +#if defined(ENABLE_LLVM) && !(defined(TARGET_ARM64) && defined(TARGET_OSX)) + // llvm has a nice built-in API to detect features + // it is not implemented on some platforms like apple arm64 + features = mono_llvm_get_cpu_features (); +#elif defined(TARGET_AMD64) || defined(TARGET_X86) || defined(TARGET_ARM64) features = mono_arch_get_cpu_features (); #endif } diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 9de5495e07707..ac933551cb120 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -3589,8 +3589,8 @@ static const IntrinGroup supported_arm_intrinsics [] = { { "AdvSimd", MONO_CPU_ARM64_NEON, advsimd_methods, sizeof (advsimd_methods) }, { "Aes", MONO_CPU_ARM64_CRYPTO, crypto_aes_methods, sizeof (crypto_aes_methods) }, { "ArmBase", MONO_CPU_ARM64_BASE, armbase_methods, sizeof (armbase_methods), TRUE }, - { "Crc32", MONO_CPU_ARM64_CRC, crc32_methods, sizeof (crc32_methods) }, - { "Dp", MONO_CPU_ARM64_DP, dp_methods, sizeof (dp_methods) }, + { "Crc32", MONO_CPU_ARM64_CRC, crc32_methods, sizeof (crc32_methods), TRUE }, + { "Dp", MONO_CPU_ARM64_DP, dp_methods, sizeof (dp_methods), TRUE }, { "Rdm", MONO_CPU_ARM64_RDM, rdm_methods, sizeof (rdm_methods) }, { "Sha1", MONO_CPU_ARM64_CRYPTO, sha1_methods, sizeof (sha1_methods) }, { "Sha256", MONO_CPU_ARM64_CRYPTO, sha256_methods, sizeof (sha256_methods) }, @@ -3976,8 +3976,24 @@ emit_arm64_intrinsics ( MonoClass *quad_klass = mono_class_from_mono_type_internal (fsig->params [2]); gboolean is_unsigned = type_is_unsigned (fsig->ret); int iid = is_unsigned ? 
INTRINS_AARCH64_ADV_SIMD_UDOT : INTRINS_AARCH64_ADV_SIMD_SDOT; - MonoInst *quad = emit_simd_ins (cfg, arg_klass, OP_ARM64_SELECT_QUAD, args [2]->dreg, args [3]->dreg); - quad->data.op [1].klass = quad_klass; + + MonoInst *quad; + if (!COMPILE_LLVM (cfg)) { + if (mono_class_value_size (arg_klass, NULL) != 16 || mono_class_value_size (quad_klass, NULL) != 16) + return NULL; + // FIXME: The c# api has ConstantExpected(Max = (byte)(15)), but the hw only supports + // selecting one of the 4 32 bit words + if (args [3]->opcode != OP_ICONST || args [3]->inst_c0 < 0 || args [3]->inst_c0 > 3) { + // FIXME: Throw the right exception ? + mono_emit_jit_icall (cfg, mono_throw_platform_not_supported, NULL); + return NULL; + } + quad = emit_simd_ins (cfg, klass, OP_ARM64_BROADCAST_ELEM, args [2]->dreg, -1); + quad->inst_c0 = args [3]->inst_c0; + } else { + quad = emit_simd_ins (cfg, arg_klass, OP_ARM64_SELECT_QUAD, args [2]->dreg, args [3]->dreg); + quad->data.op [1].klass = quad_klass; + } MonoInst *ret = emit_simd_ins (cfg, ret_klass, OP_XOP_OVR_X_X_X_X, args [0]->dreg, args [1]->dreg); ret->sreg3 = quad->dreg; ret->inst_c0 = iid; diff --git a/src/mono/mono/utils/mono-hwcap-arm64.c b/src/mono/mono/utils/mono-hwcap-arm64.c index f5cddcf25dc11..f701d26c7511b 100644 --- a/src/mono/mono/utils/mono-hwcap-arm64.c +++ b/src/mono/mono/utils/mono-hwcap-arm64.c @@ -6,9 +6,63 @@ * Licensed under the MIT license. See LICENSE file in the project root for full license information. 
*/ +#ifdef __APPLE__ +#include +#include +#endif + #include "mono/utils/mono-hwcap.h" void mono_hwcap_arch_init (void) { +#ifdef __APPLE__ + const char *prop; + guint val [16]; + size_t val_len; + int res; + + val_len = sizeof (val); + prop = "hw.optional.armv8_crc32"; + res = sysctlbyname (prop, val, &val_len, NULL, 0); + g_assert (res == 0); + g_assert (val_len == 4); + mono_hwcap_arm64_has_crc32 = *(int*)val; + + val_len = sizeof (val); + prop = "hw.optional.arm.FEAT_RDM"; + res = sysctlbyname (prop, val, &val_len, NULL, 0); + g_assert (res == 0); + g_assert (val_len == 4); + mono_hwcap_arm64_has_rdm = *(int*)val; + + val_len = sizeof (val); + prop = "hw.optional.arm.FEAT_DotProd"; + res = sysctlbyname (prop, val, &val_len, NULL, 0); + g_assert (res == 0); + g_assert (val_len == 4); + mono_hwcap_arm64_has_dot = *(int*)val; + + val_len = sizeof (val); + prop = "hw.optional.arm.FEAT_SHA1"; + res = sysctlbyname (prop, val, &val_len, NULL, 0); + g_assert (res == 0); + g_assert (val_len == 4); + mono_hwcap_arm64_has_sha1 = *(int*)val; + + val_len = sizeof (val); + prop = "hw.optional.arm.FEAT_SHA256"; + res = sysctlbyname (prop, val, &val_len, NULL, 0); + g_assert (res == 0); + g_assert (val_len == 4); + mono_hwcap_arm64_has_sha256 = *(int*)val; + + val_len = sizeof (val); + prop = "hw.optional.arm.FEAT_AES"; + res = sysctlbyname (prop, val, &val_len, NULL, 0); + g_assert (res == 0); + g_assert (val_len == 4); + mono_hwcap_arm64_has_aes = *(int*)val; + +#endif } diff --git a/src/mono/mono/utils/mono-hwcap-vars.h b/src/mono/mono/utils/mono-hwcap-vars.h index dcd192912198d..e972f7e36d1e1 100644 --- a/src/mono/mono/utils/mono-hwcap-vars.h +++ b/src/mono/mono/utils/mono-hwcap-vars.h @@ -17,6 +17,13 @@ MONO_HWCAP_VAR(arm_has_thumb2) #elif defined (TARGET_ARM64) +MONO_HWCAP_VAR(arm64_has_crc32) +MONO_HWCAP_VAR(arm64_has_dot) +MONO_HWCAP_VAR(arm64_has_rdm) +MONO_HWCAP_VAR(arm64_has_sha1) +MONO_HWCAP_VAR(arm64_has_sha256) +MONO_HWCAP_VAR(arm64_has_aes) + // Nothing here yet. 
#elif defined (TARGET_POWERPC) || defined (TARGET_POWERPC64)