From 13293530283fecd1a4894c75b46fd4aeeed10efe Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Tue, 6 Jun 2023 01:43:43 -0400 Subject: [PATCH 01/21] [mono][jit] Add JIT support for the methods in Vector128 on amd64. --- src/mono/mono/mini/cpu-amd64.mdesc | 7 ++ src/mono/mono/mini/mini-amd64.c | 130 +++++++++++++++++++--- src/mono/mono/mini/mini-ops.h | 3 + src/mono/mono/mini/mini.h | 8 ++ src/mono/mono/mini/simd-intrinsics.c | 156 ++++++++++++++++++++++----- 5 files changed, 267 insertions(+), 37 deletions(-) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index f6eb2d52494ad..121212ce41b02 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -600,6 +600,13 @@ loadu2_mem: dest:i len:16 #SIMD +xbinop: dest:x src1:x src2:x len:7 clob:1 +xbinop_forceint: dest:x src1:x src2:x len:7 clob:1 +ones_complement: dest:x src1:x len:16 clob:1 +negate: dest:x src1:x len:24 clob:1 +xlower: dest:x src1:x len:7 clob:1 +xupper: dest:x src1:x len:7 clob:1 + addps: dest:x src1:x src2:x len:4 clob:1 divps: dest:x src1:x src2:x len:4 clob:1 mulps: dest:x src1:x src2:x len:4 clob:1 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 12e72cc29cd4d..e574987edf3dd 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -67,6 +67,7 @@ static gpointer bp_trampoline; /* Offset between fp and the first argument in the callee */ #define ARGS_OFFSET 16 #define GP_SCRATCH_REG AMD64_R11 +#define SIMD_TEMP_REG MONO_ARCH_FP_SCRATCH_REG /* Max number of bblocks before we bail from using more advanced branch placement code */ #define MAX_BBLOCKS_FOR_BRANCH_OPTS 800 @@ -4015,7 +4016,7 @@ mono_arch_lowering_pass (MonoCompile *cfg, MonoBasicBlock *bb) ins->opcode = ins->inst_c1 == MONO_TYPE_R8 ? 
OP_MAXPD : OP_MAXPS; break; default: - g_assert_not_reached(); + // Handled in mono_arch_output_basic_block () break; } break; @@ -6033,8 +6034,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (cfg->compile_aot && cfg->code_exec_only) { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8_GOT, &r8_0); amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof (target_mgreg_t)); - amd64_sse_movsd_reg_membase (code, MONO_ARCH_FP_SCRATCH_REG, AMD64_R11, 0); - amd64_sse_xorpd_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + amd64_sse_movsd_reg_membase (code, SIMD_TEMP_REG, AMD64_R11, 0); + amd64_sse_xorpd_reg_reg (code, ins->dreg, SIMD_TEMP_REG); } else { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8, &r8_0); amd64_sse_xorpd_reg_membase (code, ins->dreg, AMD64_RIP, 0); @@ -6049,8 +6050,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (cfg->compile_aot && cfg->code_exec_only) { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8_GOT, &d); amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof (target_mgreg_t)); - amd64_sse_movsd_reg_membase (code, MONO_ARCH_FP_SCRATCH_REG, AMD64_R11, 0); - amd64_sse_andpd_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + amd64_sse_movsd_reg_membase (code, SIMD_TEMP_REG, AMD64_R11, 0); + amd64_sse_andpd_reg_reg (code, ins->dreg, SIMD_TEMP_REG); } else { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R8, &d); amd64_sse_andpd_reg_membase (code, ins->dreg, AMD64_RIP, 0); @@ -6081,13 +6082,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (cfg->compile_aot && cfg->code_exec_only) { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R4_GOT, &r4_0); amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof (target_mgreg_t)); - amd64_sse_movss_reg_membase (code, MONO_ARCH_FP_SCRATCH_REG, AMD64_R11, 0); + amd64_sse_movss_reg_membase (code, SIMD_TEMP_REG, AMD64_R11, 0); } else { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_R4, 
&r4_0); - amd64_sse_movss_reg_membase (code, MONO_ARCH_FP_SCRATCH_REG, AMD64_RIP, 0); + amd64_sse_movss_reg_membase (code, SIMD_TEMP_REG, AMD64_RIP, 0); } - amd64_sse_xorps_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + amd64_sse_xorps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); break; } @@ -6674,6 +6675,109 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } #ifdef MONO_ARCH_SIMD_INTRINSICS /* TODO: Some of these IR opcodes are marked as no clobber when they indeed do. */ + case OP_XBINOP: { + switch (ins->inst_c0) { + case OP_IMUL: + switch (ins->inst_c1) { + case MONO_TYPE_U4: + amd64_sse_pmuludq_reg_reg (code, ins->sreg1, ins->sreg2); + break; + default: + g_assert_not_reached (); + break; + } + break; + default: + g_assert_not_reached (); + break; + } + break; + } + case OP_XBINOP_FORCEINT: { + switch (ins->inst_c0) { + case XBINOP_FORCEINT_AND: + amd64_sse_andpd_reg_reg (code, ins->sreg1, ins->sreg2); + break; + case XBINOP_FORCEINT_OR: + amd64_sse_orpd_reg_reg (code, ins->sreg1, ins->sreg2); + break; + case XBINOP_FORCEINT_XOR: + amd64_sse_xorpd_reg_reg (code, ins->sreg1, ins->sreg2); + break; + default: + g_assert_not_reached (); + break; + } + break; + } + case OP_ONES_COMPLEMENT: + amd64_sse_pcmpeqd_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_pxor_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case OP_NEGATION: { + switch (ins->inst_c1) { + case MONO_TYPE_I1: + case MONO_TYPE_U1: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_psubb_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case MONO_TYPE_I2: + case MONO_TYPE_U2: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_psubw_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case MONO_TYPE_I4: + case MONO_TYPE_U4: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + 
amd64_sse_psubd_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case MONO_TYPE_I8: + case MONO_TYPE_U8: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_psubq_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case MONO_TYPE_R4: { + static float r8_0 [] = {-0.0, -0.0, -0.0, -0.0 }; + + if (cfg->compile_aot && cfg->code_exec_only) { + mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, &r8_0); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof(target_mgreg_t)); + amd64_sse_movups_reg_membase (code, SIMD_TEMP_REG, AMD64_R11, 0); + } else { + mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, &r8_0); + amd64_sse_movups_reg_membase (code, SIMD_TEMP_REG, AMD64_RIP, 0); + } + + g_assert (ins->sreg1 == ins->dreg); + amd64_sse_xorps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + } + case MONO_TYPE_R8: { + static double r8_0 [] = {-0.0, -0.0 }; + + if (cfg->compile_aot && cfg->code_exec_only) { + mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, &r8_0); + amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof(target_mgreg_t)); + amd64_sse_movups_reg_membase (code, SIMD_TEMP_REG, AMD64_R11, 0); + } else { + mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, &r8_0); + amd64_sse_movups_reg_membase (code, SIMD_TEMP_REG, AMD64_RIP, 0); + } + + g_assert (ins->sreg1 == ins->dreg); + amd64_sse_xorps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + } + default: + g_assert_not_reached (); + break; + } + break; + } case OP_ADDPS: amd64_sse_addps_reg_reg (code, ins->sreg1, ins->sreg2); break; @@ -7114,8 +7218,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) break; case OP_EXTRACT_I8: if (ins->inst_c0) { - amd64_movhlps_reg_reg (code, MONO_ARCH_FP_SCRATCH_REG, ins->sreg1); - amd64_movd_reg_xreg_size (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG, 8); + 
amd64_movhlps_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_movd_reg_xreg_size (code, ins->dreg, SIMD_TEMP_REG, 8); } else { amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 8); } @@ -7165,11 +7269,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_sse_pinsrw_reg_reg_imm (code, ins->dreg, ins->sreg2, ins->inst_c0 * 2 + 1); break; case OP_INSERTX_I8_SLOW: - amd64_movd_xreg_reg_size(code, MONO_ARCH_FP_SCRATCH_REG, ins->sreg2, 8); + amd64_movd_xreg_reg_size(code, SIMD_TEMP_REG, ins->sreg2, 8); if (ins->inst_c0) - amd64_movlhps_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + amd64_movlhps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); else - amd64_sse_movsd_reg_reg (code, ins->dreg, MONO_ARCH_FP_SCRATCH_REG); + amd64_sse_movsd_reg_reg (code, ins->dreg, SIMD_TEMP_REG); break; case OP_INSERTX_R4_SLOW: diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index b5bc5579b137f..09113f021fa99 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1513,7 +1513,9 @@ MINI_OP(OP_XEXTRACT, "xextract", IREG, XREG, NONE) * Generic SIMD operations, the rest of the JIT doesn't care about the exact operation. */ MINI_OP(OP_XUNOP, "xunop", XREG, XREG, NONE) +/* inst_c0 is a OP_ constant, inst_c1 is a MONO_TYPE_ constant */ MINI_OP(OP_XBINOP, "xbinop", XREG, XREG, XREG) +/* The arguments are treated as vectors of integer types. 
inst_c0 is a XBINOP_FORCEINT_ constant */ MINI_OP(OP_XBINOP_FORCEINT, "xbinop_forceint", XREG, XREG, XREG) MINI_OP(OP_XBINOP_SCALAR, "xbinop_scalar", XREG, XREG, XREG) MINI_OP(OP_XBINOP_BYSCALAR, "xbinop_byscalar", XREG, XREG, XREG) @@ -1827,6 +1829,7 @@ MINI_OP(OP_CVT_UI_FP, "convert_ui_to_fp", XREG, XREG, NONE) MINI_OP(OP_CVT_SI_FP, "convert_si_to_fp", XREG, XREG, NONE) MINI_OP(OP_CVT_UI_FP_SCALAR, "convert_ui_to_fp_scalar", XREG, XREG, NONE) MINI_OP(OP_CVT_SI_FP_SCALAR, "convert_si_to_fp_scalar", XREG, XREG, NONE) +/* inst_c1 is one of the MONO_TYPE_ constants */ MINI_OP(OP_NEGATION, "negate", XREG, XREG, NONE) MINI_OP(OP_NEGATION_SCALAR, "negate_scalar", XREG, XREG, NONE) MINI_OP3(OP_BSL, "bitwise_select", XREG, XREG, XREG, XREG) diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h index 9c5f7e9e84ce6..4761f81824444 100644 --- a/src/mono/mono/mini/mini.h +++ b/src/mono/mono/mini/mini.h @@ -2980,6 +2980,14 @@ mini_class_is_simd (MonoCompile *cfg, MonoClass *klass) #ifdef TARGET_ARM64 if (size == 8 || size == 16) return TRUE; +#elif defined(TARGET_AMD64) + +#ifdef TARGET_WIN32 + return FALSE; +#endif + if (size == 16) + return TRUE; + #else if (size == 16) return TRUE; diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index eb208bb3a6a7b..2ec25acfa4e8d 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -389,6 +389,10 @@ emit_simd_ins_for_binary_op (MonoCompile *cfg, MonoClass *klass, MonoMethodSigna #ifdef TARGET_ARM64 if (!COMPILE_LLVM (cfg) && (arg_type == MONO_TYPE_I8 || arg_type == MONO_TYPE_U8 || arg_type == MONO_TYPE_I || arg_type == MONO_TYPE_U)) return NULL; +#endif +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg)) + return NULL; #endif if (fsig->params [1]->type != MONO_TYPE_GENERICINST) return handle_mul_div_by_scalar (cfg, klass, arg_type, args [1]->dreg, args [0]->dreg, OP_IMUL); @@ -1009,6 +1013,8 @@ emit_hardware_intrinsics ( if (id == SN_get_IsSupported) 
{ MonoInst *ins = NULL; EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0); + if (cfg->verbose_level > 1) + g_printf ("\t-> %s\n", supported ? "true" : " false"); return ins; } if (!supported) { @@ -1332,15 +1338,14 @@ emit_msb_shift_vector_constant (MonoCompile *cfg, MonoClass *arg_class, MonoType return msb_shift_vec; } -/* Emit intrinsics in System.Numerics.Vector and System.Runtime.Intrinsics.Vector64/128/256/512 */ +/* + * Emit intrinsics in System.Numerics.Vector and System.Runtime.Intrinsics.Vector64/128/256/512. + * If the intrinsic is not supported for some reason, return NULL, and fall back to the C# + * implementation. + */ static MonoInst* emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args) { -#if defined(TARGET_AMD64) || defined(TARGET_WASM) - if (!COMPILE_LLVM (cfg)) - return NULL; -#endif - int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod); if (id == -1) { //check_no_intrinsic_cattr (cmethod); @@ -1379,6 +1384,92 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } #endif +#ifdef TARGET_WASM + g_assert (COMPILE_LLVM (cfg)); +#endif + +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg)) { + if (vector_size != 128) + return NULL; +#ifdef TARGET_WIN32 + return NULL; +#endif + switch (id) { + case SN_Abs: + case SN_Add: + case SN_AndNot: + case SN_As: + case SN_AsByte: + case SN_AsDouble: + case SN_AsInt16: + case SN_AsInt32: + case SN_AsInt64: + case SN_AsSByte: + case SN_AsSingle: + case SN_AsUInt16: + case SN_AsUInt32: + case SN_AsUInt64: + case SN_BitwiseAnd: + case SN_BitwiseOr: + case SN_Ceiling: + case SN_ConditionalSelect: + case SN_ConvertToDouble: + case SN_ConvertToInt32: + case SN_ConvertToInt64: + case SN_ConvertToSingle: + case SN_ConvertToUInt32: + case SN_ConvertToUInt64: + case SN_Create: + case SN_CreateScalar: + case SN_CreateScalarUnsafe: + case SN_Divide: + case SN_Dot: + case SN_Equals: + case SN_EqualsAll: + case SN_EqualsAny: + 
case SN_ExtractMostSignificantBits: + case SN_Floor: + case SN_GetElement: + case SN_GetLower: + case SN_GetUpper: + case SN_GreaterThan: + case SN_GreaterThanAll: + case SN_GreaterThanAny: + case SN_GreaterThanOrEqual: + case SN_GreaterThanOrEqualAll: + case SN_GreaterThanOrEqualAny: + case SN_LessThan: + case SN_LessThanAll: + case SN_LessThanAny: + case SN_LessThanOrEqual: + case SN_LessThanOrEqualAll: + case SN_LessThanOrEqualAny: + case SN_Max: + case SN_Min: + case SN_Multiply: + case SN_Narrow: + case SN_Negate: + case SN_OnesComplement: + case SN_Shuffle: + case SN_Sqrt: + case SN_Subtract: + case SN_Sum: + case SN_ToScalar: + case SN_ToVector128: + case SN_ToVector128Unsafe: + case SN_WidenLower: + case SN_WidenUpper: + case SN_WithElement: + case SN_Xor: + case SN_get_IsHardwareAccelerated: + return NULL; + default: + break; + } + } +#endif + MonoClass* klass = fsig->param_count > 0 ? args[0]->klass : cmethod->klass; MonoTypeEnum arg0_type = fsig->param_count > 0 ? get_underlying_type (fsig->params [0]) : MONO_TYPE_VOID; @@ -2219,6 +2310,7 @@ static guint16 vector64_vector128_t_methods [] = { SN_op_UnaryPlus, }; +/* Emit intrinsics in System.Runtime.Intrinsics.Vector64/128/256/512 */ static MonoInst* emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsig, MonoInst **args) { @@ -2230,6 +2322,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign MonoClass *klass = cmethod->klass; MonoType *etype = mono_class_get_context (klass)->class_inst->type_argv [0]; + gboolean supported = TRUE; if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; @@ -2247,26 +2340,37 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign g_free (name); } +#if defined(TARGET_WASM) + if (!COMPILE_LLVM (cfg)) + supported = FALSE; +#endif + +// FIXME: Support Vector64 for mini JIT on arm64 +#ifdef TARGET_ARM64 + if (!COMPILE_LLVM (cfg) && (size != 16)) + supported = FALSE; +#endif + 
+#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg) && (size != 16)) + supported = FALSE; +#ifdef TARGET_WIN32 + supported = FALSE; +#endif +#endif + switch (id) { case SN_get_IsSupported: { MonoInst *ins = NULL; - EMIT_NEW_ICONST (cfg, ins, 1); + EMIT_NEW_ICONST (cfg, ins, supported ? 1 : 0); return ins; } default: break; } -#if defined(TARGET_AMD64) || defined(TARGET_WASM) - if (!COMPILE_LLVM (cfg)) + if (!supported) return NULL; -#endif - -// FIXME: Support Vector64 for mini JIT on arm64 -#ifdef TARGET_ARM64 - if (!COMPILE_LLVM (cfg) && (size != 16)) - return NULL; -#endif switch (id) { case SN_get_Count: { @@ -2283,12 +2387,16 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign return emit_xones (cfg, klass); } case SN_get_One: { - if (size != 16) - return NULL; + guint64 buf [8]; + + /* For Vector64, the upper elements are 0 */ + g_assert (sizeof (buf) >= size); + memset (buf, 0, sizeof (buf)); + switch (etype->type) { case MONO_TYPE_I1: case MONO_TYPE_U1: { - guint8 value[16]; + guint8 *value = (guint8*)buf; for (int i = 0; i < len; ++i) { value [i] = 1; @@ -2298,7 +2406,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign } case MONO_TYPE_I2: case MONO_TYPE_U2: { - guint16 value[8]; + guint16 *value = (guint16*)buf; for (int i = 0; i < len; ++i) { value [i] = 1; @@ -2312,7 +2420,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign #endif case MONO_TYPE_I4: case MONO_TYPE_U4: { - guint32 value[4]; + guint32 *value = (guint32*)buf; for (int i = 0; i < len; ++i) { value [i] = 1; @@ -2326,7 +2434,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign #endif case MONO_TYPE_I8: case MONO_TYPE_U8: { - guint64 value[2]; + guint64 *value = (guint64*)buf; for (int i = 0; i < len; ++i) { value [i] = 1; @@ -2335,7 +2443,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign return emit_xconst_v128 (cfg, klass, 
(guint8*)value); } case MONO_TYPE_R4: { - float value[4]; + float *value = (float*)buf; for (int i = 0; i < len; ++i) { value [i] = 1.0f; @@ -2344,7 +2452,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign return emit_xconst_v128 (cfg, klass, (guint8*)value); } case MONO_TYPE_R8: { - double value[2]; + double *value = (double*)buf; for (int i = 0; i < len; ++i) { value [i] = 1.0; From 388fca7ef2089c3a3c6ee54d597aa7677058cbb8 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Tue, 6 Jun 2023 01:44:23 -0400 Subject: [PATCH 02/21] Avoid emitting OP_NOT_NULL opcodes in non-LLVM mode: they are not used there, and they keep their inputs alive. --- src/mono/mono/mini/ir-emit.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mono/mono/mini/ir-emit.h b/src/mono/mono/mini/ir-emit.h index 6c0b84213eb49..6b88e959e3852 100644 --- a/src/mono/mono/mini/ir-emit.h +++ b/src/mono/mono/mini/ir-emit.h @@ -886,7 +886,7 @@ static int ccount = 0; cfg->flags |= MONO_CFG_HAS_CHECK_THIS; \ MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, (reg), 0); \ MONO_EMIT_NEW_COND_EXC (cfg, EQ, "NullReferenceException"); \ - MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, reg); \ + if (COMPILE_LLVM (cfg)) MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, reg); \ } while (0) /* Emit an explicit null check which doesn't depend on SIGSEGV signal handling */ @@ -897,7 +897,7 @@ static int ccount = 0; } else { \ MONO_EMIT_NEW_IMPLICIT_EXCEPTION_LOAD_STORE (cfg); \ } \ - MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, reg); \ + if (COMPILE_LLVM (cfg)) MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, reg); \ } while (0) #define MONO_EMIT_NEW_CHECK_THIS(cfg, sreg) do { \ @@ -907,7 +907,7 @@ static int ccount = 0; } else { \ MONO_EMIT_NEW_UNALU (cfg, OP_CHECK_THIS, -1, sreg); \ MONO_EMIT_NEW_IMPLICIT_EXCEPTION_LOAD_STORE (cfg); \ - MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, sreg); \ + if (COMPILE_LLVM (cfg)) MONO_EMIT_NEW_UNALU (cfg, OP_NOT_NULL, -1, sreg); \ } \ } while (0) 
From 0ccc5da28339019375c855166903e464a184bab7 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Wed, 7 Jun 2023 02:39:47 -0400 Subject: [PATCH 03/21] Return true for IsSupported for non-128-bit types even if the operations are not supported. --- src/mono/mono/mini/simd-intrinsics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 2ec25acfa4e8d..4c0253a523d43 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -2348,7 +2348,7 @@ emit_vector64_vector128_t (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSign // FIXME: Support Vector64 for mini JIT on arm64 #ifdef TARGET_ARM64 if (!COMPILE_LLVM (cfg) && (size != 16)) - supported = FALSE; + return NULL; #endif #ifdef TARGET_AMD64 From 36b5d8ac594f17b331a1ae063f65b3c69b917dd8 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Wed, 7 Jun 2023 09:53:25 -0400 Subject: [PATCH 04/21] Fix windows support. --- src/mono/mono/mini/mini.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/mono/mono/mini/mini.h b/src/mono/mono/mini/mini.h index 4761f81824444..9c5f7e9e84ce6 100644 --- a/src/mono/mono/mini/mini.h +++ b/src/mono/mono/mini/mini.h @@ -2980,14 +2980,6 @@ mini_class_is_simd (MonoCompile *cfg, MonoClass *klass) #ifdef TARGET_ARM64 if (size == 8 || size == 16) return TRUE; -#elif defined(TARGET_AMD64) - -#ifdef TARGET_WIN32 - return FALSE; -#endif - if (size == 16) - return TRUE; - #else if (size == 16) return TRUE; From fbbc42599d7eba4efb76eba2c75cc063c73f7468 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Wed, 7 Jun 2023 15:41:50 -0400 Subject: [PATCH 05/21] Add support for some binary operations. 
--- src/mono/mono/arch/amd64/amd64-codegen.h | 9 ++++ src/mono/mono/mini/cpu-amd64.mdesc | 4 ++ src/mono/mono/mini/mini-amd64.c | 32 +++++++++++++++ src/mono/mono/mini/mini-ops.h | 3 ++ src/mono/mono/mini/simd-intrinsics.c | 52 +++++++++++++++--------- 5 files changed, 80 insertions(+), 20 deletions(-) diff --git a/src/mono/mono/arch/amd64/amd64-codegen.h b/src/mono/mono/arch/amd64/amd64-codegen.h index 9ac73b9853c46..6f90941265cb2 100644 --- a/src/mono/mono/arch/amd64/amd64-codegen.h +++ b/src/mono/mono/arch/amd64/amd64-codegen.h @@ -894,6 +894,8 @@ typedef union { #define amd64_sse_shufpd_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm((inst), (dreg), (reg), 0x66, 0x0f, 0xC6, (imm)) +#define amd64_sse_roundps_reg_reg_imm(inst, dreg, reg, imm) emit_sse_reg_reg_op4_imm((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x08, (imm)) + #define amd64_sse_roundpd_reg_reg_imm(inst, dreg, reg, imm) emit_sse_reg_reg_op4_imm((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x09, (imm)) #define amd64_sse_addpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0x66, 0x0f, 0x58) @@ -1169,6 +1171,13 @@ typedef union { #define amd64_sse_lzcnt_reg_reg_size(inst, dreg, reg, size) emit_sse_reg_reg_size((inst), (dreg), (reg), 0xf3, 0x0f, 0xbd, (size)) #define amd64_sse_popcnt_reg_reg_size(inst, dreg, reg, size) emit_sse_reg_reg_size((inst), (dreg), (reg), 0xf3, 0x0f, 0xb8, (size)) +#define amd64_sse_psrlq_reg_imm(inst, reg, imm) emit_sse_reg_reg_imm((inst), X86_SSE_SHR, (reg), 0x66, 0x0f, 0x73, (imm)) +#define amd64_sse_dpps_reg_reg(inst, dreg, sreg, mask) emit_sse_reg_reg_op4_imm((inst), (dreg), (sreg), 0x66, 0x0f, 0x3a, 0x40, (mask)) +#define amd64_sse_dppd_reg_reg(inst, dreg, sreg, mask) emit_sse_reg_reg_op4_imm((inst), (dreg), (sreg), 0x66, 0x0f, 0x3a, 0x41, (mask)) +#define amd64_sse_phaddw_reg_reg(inst, dreg, sreg) emit_sse_reg_reg_op4((inst), (dreg), (sreg), 0x66, 0x0f, 0x38, 0x01) +#define amd64_sse_phaddd_reg_reg(inst, dreg, sreg) emit_sse_reg_reg_op4((inst), (dreg), 
(sreg), 0x66, 0x0f, 0x38, 0x02) +#define amd64_movq_reg_reg(inst,dreg,sreg) emit_sse_reg_reg ((inst), (dreg), (sreg), 0xf3, 0x0f, 0x7e) + /* Generated from x86-codegen.h */ #define amd64_breakpoint_size(inst,size) do { x86_breakpoint(inst); } while (0) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index 121212ce41b02..c7b8867e9851c 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -786,6 +786,7 @@ extract_i8: dest:i src1:x len:9 extract_i2: dest:i src1:x len:13 extract_i1: dest:i src1:x len:13 extract_r8: dest:f src1:x len:5 +extract_r4: dest:f src1:x len:5 iconv_to_r4_raw: dest:f src1:i len:10 @@ -817,6 +818,9 @@ expand_i4: dest:x src1:i len:11 expand_i8: dest:x src1:i len:11 expand_r4: dest:x src1:f len:16 expand_r8: dest:x src1:f len:13 +xop_x_x_x: dest:x src1:x src2:x len:16 clob:1 +sse41_dpps_imm: dest:x src1:x src2:x len:7 clob:1 +sse41_dppd_imm: dest:x src1:x src2:x len:7 clob:1 roundp: dest:x src1:x len:10 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index e574987edf3dd..65fbaab33e299 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -48,6 +48,7 @@ #include "mini-gc.h" #include "mini-runtime.h" #include "aot-runtime.h" +#include "llvm-intrinsics-types.h" MONO_DISABLE_WARNING(4127) /* conditional expression is constant */ @@ -6679,6 +6680,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) switch (ins->inst_c0) { case OP_IMUL: switch (ins->inst_c1) { + case MONO_TYPE_I4: case MONO_TYPE_U4: amd64_sse_pmuludq_reg_reg (code, ins->sreg1, ins->sreg2); break; @@ -6710,6 +6712,32 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } break; } + case OP_XOP_X_X_X: { + switch (ins->inst_c0) { + case INTRINS_SSE_PHADDW: + amd64_sse_phaddw_reg_reg (code, ins->dreg, ins->sreg2); + break; + case INTRINS_SSE_PHADDD: + amd64_sse_phaddd_reg_reg (code, ins->dreg, ins->sreg2); + break; + 
case INTRINS_SSE_HADDPS: + amd64_sse_haddps_reg_reg (code, ins->dreg, ins->sreg2); + break; + case INTRINS_SSE_HADDPD: + amd64_sse_haddpd_reg_reg (code, ins->dreg, ins->sreg2); + break; + default: + g_assert_not_reached (); + break; + } + break; + } + case OP_SSE41_DPPS_IMM: + amd64_sse_dpps_reg_reg (code, ins->dreg, ins->sreg2, ins->inst_c0); + break; + case OP_SSE41_DPPD_IMM: + amd64_sse_dppd_reg_reg (code, ins->dreg, ins->sreg2, ins->inst_c0); + break; case OP_ONES_COMPLEMENT: amd64_sse_pcmpeqd_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); amd64_sse_pxor_reg_reg (code, ins->dreg, SIMD_TEMP_REG); @@ -7243,6 +7271,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) else amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1); break; + case OP_EXTRACT_R4: + g_assert (!ins->inst_c0); + amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg1); + break; case OP_INSERT_I2: amd64_sse_pinsrw_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); break; diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index 09113f021fa99..9d579ffb22b8b 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1145,6 +1145,9 @@ MINI_OP(OP_SSE_CVTII, "sse_cvtii", XREG, XREG, NONE) MINI_OP3(OP_SSE41_DPPS, "sse41_dpps", XREG, XREG, XREG, IREG) MINI_OP3(OP_SSE41_DPPD, "sse41_dppd", XREG, XREG, XREG, IREG) MINI_OP3(OP_SSE41_MPSADBW, "sse41_mpsadbw", XREG, XREG, XREG, IREG) +/* inst_c0 contains the mask value */ +MINI_OP(OP_SSE41_DPPS_IMM, "sse41_dpps_imm", XREG, XREG, XREG) +MINI_OP(OP_SSE41_DPPD_IMM, "sse41_dppd_imm", XREG, XREG, XREG) /* pclmulqdq */ MINI_OP3(OP_PCLMULQDQ, "pclmulqdq", XREG, XREG, XREG, IREG) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 4c0253a523d43..119ff7760d7c0 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -380,9 +380,17 @@ emit_simd_ins_for_binary_op (MonoCompile *cfg, MonoClass *klass, MonoMethodSigna 
return NULL; case SN_Max: instc0 = type_enum_is_unsigned (arg_type) ? OP_IMAX_UN : OP_IMAX; +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg) && instc0 == OP_IMAX_UN) + return NULL; +#endif break; case SN_Min: instc0 = type_enum_is_unsigned (arg_type) ? OP_IMIN_UN : OP_IMIN; +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg) && instc0 == OP_IMIN_UN) + return NULL; +#endif break; case SN_Multiply: case SN_op_Multiply: { @@ -1397,7 +1405,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi #endif switch (id) { case SN_Abs: - case SN_Add: case SN_AndNot: case SN_As: case SN_AsByte: @@ -1410,8 +1417,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_AsUInt16: case SN_AsUInt32: case SN_AsUInt64: - case SN_BitwiseAnd: - case SN_BitwiseOr: case SN_Ceiling: case SN_ConditionalSelect: case SN_ConvertToDouble: @@ -1423,8 +1428,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_Create: case SN_CreateScalar: case SN_CreateScalarUnsafe: - case SN_Divide: - case SN_Dot: case SN_Equals: case SN_EqualsAll: case SN_EqualsAny: @@ -1445,15 +1448,11 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_LessThanOrEqual: case SN_LessThanOrEqualAll: case SN_LessThanOrEqualAny: - case SN_Max: - case SN_Min: - case SN_Multiply: case SN_Narrow: case SN_Negate: case SN_OnesComplement: case SN_Shuffle: case SN_Sqrt: - case SN_Subtract: case SN_Sum: case SN_ToScalar: case SN_ToVector128: @@ -1461,7 +1460,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_WidenLower: case SN_WidenUpper: case SN_WithElement: - case SN_Xor: case SN_get_IsHardwareAccelerated: return NULL; default: @@ -1737,21 +1735,31 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi int instc =-1; if (type_enum_is_float (arg0_type)) { if (is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSE41)) { - int mask_reg 
= alloc_ireg (cfg); + int mask_val = -1; switch (arg0_type) { - case MONO_TYPE_R4: - instc = OP_SSE41_DPPS; - MONO_EMIT_NEW_ICONST (cfg, mask_reg, 0xf1); // 0xf1 ... 0b11110001 + case MONO_TYPE_R4: + instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPS : OP_SSE41_DPPS_IMM; + mask_val = 0xf1; // 0xf1 ... 0b11110001 + break; + case MONO_TYPE_R8: + instc = COMPILE_LLVM (cfg) ? OP_SSE41_DPPD : OP_SSE41_DPPD_IMM; + mask_val = 0x31; // 0x31 ... 0b00110001 break; - case MONO_TYPE_R8: - instc = OP_SSE41_DPPD; - MONO_EMIT_NEW_ICONST (cfg, mask_reg, 0x31); // 0x31 ... 0b00110001 - break; default: return NULL; } - MonoInst *dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); - dot->sreg3 = mask_reg; + + MonoInst *dot; + if (COMPILE_LLVM (cfg)) { + int mask_reg = alloc_ireg (cfg); + MONO_EMIT_NEW_ICONST (cfg, mask_reg, mask_val); + + dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); + dot->sreg3 = mask_reg; + } else { + dot = emit_simd_ins (cfg, klass, instc, args [0]->dreg, args [1]->dreg); + dot->inst_c0 = mask_val; + } return extract_first_element (cfg, klass, arg0_type, dot->dreg); } else { @@ -1761,6 +1769,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1) return NULL; // We don't support sum vector for byte, sbyte types yet + // FIXME: + if (!COMPILE_LLVM (cfg) && !(arg0_type == MONO_TYPE_I4 || arg0_type == MONO_TYPE_U4)) + return NULL; + instc = OP_IMUL; } MonoInst *pairwise_multiply = emit_simd_ins_for_sig (cfg, klass, OP_XBINOP, instc, arg0_type, fsig, args); From fdf3d4df939a4a233e120b02c049a55bccdc28ff Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Thu, 8 Jun 2023 04:21:42 -0400 Subject: [PATCH 06/21] Disable Dot for integer types. 
--- src/mono/mono/mini/mini-amd64.c | 4 ++-- src/mono/mono/mini/simd-intrinsics.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 65fbaab33e299..f6c0bfd83c29d 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -6680,8 +6680,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) switch (ins->inst_c0) { case OP_IMUL: switch (ins->inst_c1) { - case MONO_TYPE_I4: - case MONO_TYPE_U4: + case MONO_TYPE_I8: + case MONO_TYPE_U8: amd64_sse_pmuludq_reg_reg (code, ins->sreg1, ins->sreg2); break; default: diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 119ff7760d7c0..dbd54f400b165 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1770,7 +1770,7 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return NULL; // We don't support sum vector for byte, sbyte types yet // FIXME: - if (!COMPILE_LLVM (cfg) && !(arg0_type == MONO_TYPE_I4 || arg0_type == MONO_TYPE_U4)) + if (!COMPILE_LLVM (cfg)) return NULL; instc = OP_IMUL; From c993b5073afd106053a8d83cbbf72dd06eb2e36c Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Thu, 8 Jun 2023 06:26:27 -0400 Subject: [PATCH 07/21] Add support for Create. 
--- src/mono/mono/arch/amd64/amd64-codegen.h | 12 ++++++++++- src/mono/mono/mini/cpu-amd64.mdesc | 5 +++++ src/mono/mono/mini/mini-amd64.c | 26 ++++++++++++++++++++++++ src/mono/mono/mini/mini-ops.h | 5 ++++- src/mono/mono/mini/simd-intrinsics.c | 6 +++++- 5 files changed, 51 insertions(+), 3 deletions(-) diff --git a/src/mono/mono/arch/amd64/amd64-codegen.h b/src/mono/mono/arch/amd64/amd64-codegen.h index 6f90941265cb2..60d962ada13da 100644 --- a/src/mono/mono/arch/amd64/amd64-codegen.h +++ b/src/mono/mono/arch/amd64/amd64-codegen.h @@ -776,6 +776,13 @@ typedef union { amd64_codegen_post(inst); \ } while (0) +#define emit_sse_reg_reg_op4_size_imm(inst,dreg,reg,op1,op2,op3,op4,size,imm) do { \ + amd64_codegen_pre(inst); \ + emit_sse_reg_reg_op4_size ((inst), (dreg), (reg), (op1), (op2), (op3), (op4), (size)); \ + x86_imm_emit8 ((inst), (imm)); \ + amd64_codegen_post(inst); \ +} while (0) + /* specific SSE opcode defines */ #define amd64_sse_xorpd_reg_reg(inst,dreg,reg) emit_sse_reg_reg ((inst),(dreg),(reg), 0x66, 0x0f, 0x57) @@ -836,8 +843,11 @@ typedef union { #define amd64_sse_sqrtsd_reg_reg(inst,dreg,reg) emit_sse_reg_reg((inst), (dreg), (reg), 0xf2, 0x0f, 0x51) - +#define amd64_sse_pinsrb_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x20, (imm)) +#define amd64_sse_pinsrd_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x22, (imm)) +#define amd64_sse_pinsrq_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_size_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x22, 8, (imm)) #define amd64_sse_pinsrw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0xc4, (imm)) +#define amd64_sse_insertps_reg_reg(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x21, (imm)) #define amd64_sse_pextrw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0xc5, (imm)) diff 
--git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index c7b8867e9851c..9d772fb513637 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -790,7 +790,12 @@ extract_r4: dest:f src1:x len:5 iconv_to_r4_raw: dest:f src1:i len:10 +insert_i1: dest:x src1:x src2:i len:7 clob:1 insert_i2: dest:x src1:x src2:i len:6 clob:1 +insert_i4: dest:x src1:x src2:i len:7 clob:1 +insert_i8: dest:x src1:x src2:i len:7 clob:1 +insert_r4: dest:x src1:x src2:f len:7 clob:1 +insert_r8: dest:x src1:x src2:f len:24 clob:1 extractx_u2: dest:i src1:x len:6 insertx_u1_slow: dest:x src1:i src2:i len:18 clob:x diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index f6c0bfd83c29d..c632f1c4ab04d 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -7275,9 +7275,35 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) g_assert (!ins->inst_c0); amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg1); break; + case OP_INSERT_I1: + amd64_sse_pinsrb_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); + break; case OP_INSERT_I2: amd64_sse_pinsrw_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); break; + case OP_INSERT_I4: + amd64_sse_pinsrd_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); + break; + case OP_INSERT_I8: + amd64_sse_pinsrq_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); + break; + case OP_INSERT_R4: { + guint8 imm = (0 << 6) | (ins->inst_c0 << 4); + amd64_sse_insertps_reg_reg (code, ins->sreg1, ins->sreg2, imm); + break; + } + case OP_INSERT_R8: { + if (ins->inst_c0 == 0) { + amd64_sse_orpd_reg_reg (code, ins->dreg, ins->sreg2); + } else { + guint8 imm = 0b01001110; + amd64_sse_movaps_reg_reg (code, GP_SCRATCH_REG, ins->sreg2); + amd64_sse_pshufd_reg_reg_imm (code, GP_SCRATCH_REG, GP_SCRATCH_REG, imm); + amd64_sse_orpd_reg_reg (code, GP_SCRATCH_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, 
GP_SCRATCH_REG); + } + break; + } case OP_EXTRACTX_U2: amd64_sse_pextrw_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); break; diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index 9d579ffb22b8b..c74f3f47bf9ac 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -817,7 +817,10 @@ MINI_OP(OP_EXTRACT_R4, "extract_r4", FREG, XREG, NONE) MINI_OP(OP_EXTRACT_R8, "extract_r8", FREG, XREG, NONE) MINI_OP(OP_EXTRACTX_U2, "extractx_u2", IREG, XREG, NONE) -/* Used by LLVM */ +/* + * Insert an element into a vector with a constant lane index. + * inst_c0 is the lane index. + */ MINI_OP(OP_INSERT_I1, "insert_i1", XREG, XREG, IREG) MINI_OP(OP_INSERT_I2, "insert_i2", XREG, XREG, IREG) MINI_OP(OP_INSERT_I4, "insert_i4", XREG, XREG, IREG) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index dbd54f400b165..16f1e04f0a1de 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1425,7 +1425,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_ConvertToSingle: case SN_ConvertToUInt32: case SN_ConvertToUInt64: - case SN_Create: case SN_CreateScalar: case SN_CreateScalarUnsafe: case SN_Equals: @@ -1680,6 +1679,11 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi // Require Vector64 SIMD support if (!COMPILE_LLVM (cfg)) return NULL; +#endif +#if defined(TARGET_AMD64) + // Require Vector64 SIMD support + if (!COMPILE_LLVM (cfg)) + return NULL; #endif return emit_simd_ins (cfg, klass, OP_XCONCAT, args [0]->dreg, args [1]->dreg); } From fed31374efa601e267f2ffa69d69f2b275aa8471 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Thu, 8 Jun 2023 14:00:16 -0400 Subject: [PATCH 08/21] Fix some typos in the intrinsics tests. 
--- .../Shared/VectorGetAndWithElementTest.template | 14 +++++++------- .../VectorGetAndWithLowerAndUpperTest.template | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithElementTest.template b/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithElementTest.template index bc4f82d21a2e5..68d0af4ed576d 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithElementTest.template +++ b/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithElementTest.template @@ -75,7 +75,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.GetElement({imm}): {nameof(RunBasicScenario)} failed to throw ArgumentOutOfRangeException."); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.GetElement({imm}): {nameof(RunBasicScenario)} failed to throw ArgumentOutOfRangeException."); TestLibrary.TestFramework.LogInformation(string.Empty); Succeeded = false; @@ -97,7 +97,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.WithElement({imm}): {nameof(RunBasicScenario)} failed to throw ArgumentOutOfRangeException."); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.WithElement({imm}): {nameof(RunBasicScenario)} failed to throw ArgumentOutOfRangeException."); TestLibrary.TestFramework.LogInformation(string.Empty); Succeeded = false; @@ -135,7 +135,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.GetElement({imm}): {nameof(RunReflectionScenario)} failed to throw ArgumentOutOfRangeException."); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.GetElement({imm}): {nameof(RunReflectionScenario)} failed to throw ArgumentOutOfRangeException."); 
TestLibrary.TestFramework.LogInformation(string.Empty); Succeeded = false; @@ -161,7 +161,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.WithElement({imm}): {nameof(RunReflectionScenario)} failed to throw ArgumentOutOfRangeException."); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.WithElement({imm}): {nameof(RunReflectionScenario)} failed to throw ArgumentOutOfRangeException."); TestLibrary.TestFramework.LogInformation(string.Empty); Succeeded = false; @@ -183,7 +183,7 @@ namespace JIT.HardwareIntrinsics.General { Succeeded = false; - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.GetElement({Imm}): {method} failed:"); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.GetElement({Imm}): {method} failed:"); TestLibrary.TestFramework.LogInformation($" value: ({string.Join(", ", values)})"); TestLibrary.TestFramework.LogInformation($" result: ({result})"); TestLibrary.TestFramework.LogInformation(string.Empty); @@ -217,9 +217,9 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.WithElement({Imm}): {method} failed:"); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.WithElement({Imm}): {method} failed:"); TestLibrary.TestFramework.LogInformation($" value: ({string.Join(", ", values)})"); - TestLibrary.TestFramework.LogInformation($" insert: insertedValue"); + TestLibrary.TestFramework.LogInformation($" insert: {insertedValue}"); TestLibrary.TestFramework.LogInformation($" result: ({string.Join(", ", result)})"); TestLibrary.TestFramework.LogInformation(string.Empty); diff --git a/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithLowerAndUpperTest.template b/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithLowerAndUpperTest.template index 1cee5a952a3ee..1a648dab4a54a 100644 --- 
a/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithLowerAndUpperTest.template +++ b/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorGetAndWithLowerAndUpperTest.template @@ -178,7 +178,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.WithLower(): {method} failed:"); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.WithLower(): {method} failed:"); TestLibrary.TestFramework.LogInformation($" value: ({string.Join(", ", values)})"); TestLibrary.TestFramework.LogInformation($" result: ({string.Join(", ", result)})"); TestLibrary.TestFramework.LogInformation(string.Empty); @@ -199,7 +199,7 @@ namespace JIT.HardwareIntrinsics.General if (!succeeded) { - TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}.WithUpper(): {method} failed:"); + TestLibrary.TestFramework.LogInformation($"{VectorType}<{BaseType}>.WithUpper(): {method} failed:"); TestLibrary.TestFramework.LogInformation($" value: ({string.Join(", ", values)})"); TestLibrary.TestFramework.LogInformation($" result: ({string.Join(", ", result)})"); TestLibrary.TestFramework.LogInformation(string.Empty); From f7647b39fa976caa90f583b463eb7e375dd5dfe2 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Thu, 8 Jun 2023 16:11:07 -0400 Subject: [PATCH 09/21] Enable As methods. 
--- src/mono/mono/mini/simd-intrinsics.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 16f1e04f0a1de..235b25216e3dd 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1406,17 +1406,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi switch (id) { case SN_Abs: case SN_AndNot: - case SN_As: - case SN_AsByte: - case SN_AsDouble: - case SN_AsInt16: - case SN_AsInt32: - case SN_AsInt64: - case SN_AsSByte: - case SN_AsSingle: - case SN_AsUInt16: - case SN_AsUInt32: - case SN_AsUInt64: case SN_Ceiling: case SN_ConditionalSelect: case SN_ConvertToDouble: From 2b92851d4c4a670350e47635c6c1e7e2f8044998 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Fri, 9 Jun 2023 04:58:41 -0400 Subject: [PATCH 10/21] Enable Abs/AndNot. Simplify INSERT_R8. --- src/mono/mono/mini/cpu-amd64.mdesc | 1 + src/mono/mono/mini/mini-amd64.c | 11 ++++++----- src/mono/mono/mini/simd-intrinsics.c | 6 ++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index 9d772fb513637..df61041bd88ae 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -826,6 +826,7 @@ expand_r8: dest:x src1:f len:13 xop_x_x_x: dest:x src1:x src2:x len:16 clob:1 sse41_dpps_imm: dest:x src1:x src2:x len:7 clob:1 sse41_dppd_imm: dest:x src1:x src2:x len:7 clob:1 +vector_andnot: dest:x src1:x src2:x len:7 clob:1 roundp: dest:x src1:x len:10 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index c632f1c4ab04d..689e5d22ca348 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -6952,6 +6952,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_PXOR: amd64_sse_pxor_reg_reg (code, ins->sreg1, ins->sreg2); break; + case OP_VECTOR_ANDN: + g_assert 
(ins->dreg == ins->sreg1); + amd64_sse_pandn_reg_reg (code, ins->dreg, ins->sreg2); + break; case OP_PADDB: amd64_sse_paddb_reg_reg (code, ins->sreg1, ins->sreg2); @@ -7296,11 +7300,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (ins->inst_c0 == 0) { amd64_sse_orpd_reg_reg (code, ins->dreg, ins->sreg2); } else { - guint8 imm = 0b01001110; - amd64_sse_movaps_reg_reg (code, GP_SCRATCH_REG, ins->sreg2); - amd64_sse_pshufd_reg_reg_imm (code, GP_SCRATCH_REG, GP_SCRATCH_REG, imm); - amd64_sse_orpd_reg_reg (code, GP_SCRATCH_REG, ins->sreg1); - amd64_sse_movaps_reg_reg (code, ins->dreg, GP_SCRATCH_REG); + g_assert (ins->inst_c0 == 1); + amd64_movlhps_reg_reg (code, ins->dreg, ins->sreg2); } break; } diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 235b25216e3dd..3f2251f3da3ec 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1404,8 +1404,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return NULL; #endif switch (id) { - case SN_Abs: - case SN_AndNot: case SN_Ceiling: case SN_ConditionalSelect: case SN_ConvertToDouble: @@ -1485,10 +1483,14 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi // args [0] & ~vector(-0.0) MonoInst *zero = emit_xzero(cfg, arg_class); // 0.0 zero = emit_simd_ins (cfg, klass, OP_NEGATION, zero->dreg, -1); // -0.0 + zero->inst_c1 = arg0_type; MonoInst *ins = emit_simd_ins (cfg, klass, OP_VECTOR_ANDN, zero->dreg, args [0]->dreg); ins->inst_c1 = arg0_type; return ins; } else { + if (!COMPILE_LLVM (cfg)) + // FIXME: + return NULL; return emit_simd_ins_for_sig (cfg, klass, OP_VECTOR_IABS, -1, arg0_type, fsig, args); } #elif defined(TARGET_WASM) From 578b813070b1d17ac8d8910d89787c7301bd45a6 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Fri, 9 Jun 2023 09:34:05 -0400 Subject: [PATCH 11/21] Add more methods. 
--- src/mono/mono/mini/cpu-amd64.mdesc | 2 +- src/mono/mono/mini/mini-amd64.c | 16 ++++++++++++---- src/mono/mono/mini/simd-intrinsics.c | 12 ++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index df61041bd88ae..3c3869fd7506b 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -786,7 +786,7 @@ extract_i8: dest:i src1:x len:9 extract_i2: dest:i src1:x len:13 extract_i1: dest:i src1:x len:13 extract_r8: dest:f src1:x len:5 -extract_r4: dest:f src1:x len:5 +extract_r4: dest:f src1:x len:24 iconv_to_r4_raw: dest:f src1:i len:10 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 689e5d22ca348..b225d337862d3 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -7275,10 +7275,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) else amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg1); break; - case OP_EXTRACT_R4: - g_assert (!ins->inst_c0); - amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg1); + case OP_EXTRACT_R4: { + if (ins->inst_c0 == 0) { + amd64_sse_movss_reg_reg (code, ins->dreg, ins->sreg1); + } else { + int imm = ins->inst_c0; + amd64_sse_movaps_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_shufps_reg_reg_imm (code, SIMD_TEMP_REG, ins->sreg1, imm); + amd64_sse_pxor_reg_reg (code, ins->dreg, ins->dreg); + amd64_sse_movss_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + } break; + } case OP_INSERT_I1: amd64_sse_pinsrb_reg_reg_imm (code, ins->sreg1, ins->sreg2, ins->inst_c0); break; @@ -7456,7 +7464,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (ins->inst_c1 == MONO_TYPE_R8) amd64_sse_roundpd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); else - g_assert_not_reached (); // roundps, but it's not used anywhere for non-llvm back-end yet. 
+ amd64_sse_roundps_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); break; } #endif diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 3f2251f3da3ec..40fa7dd233c4c 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1404,7 +1404,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return NULL; #endif switch (id) { - case SN_Ceiling: case SN_ConditionalSelect: case SN_ConvertToDouble: case SN_ConvertToInt32: @@ -1418,8 +1417,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_EqualsAll: case SN_EqualsAny: case SN_ExtractMostSignificantBits: - case SN_Floor: - case SN_GetElement: case SN_GetLower: case SN_GetUpper: case SN_GreaterThan: @@ -1435,8 +1432,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_LessThanOrEqualAll: case SN_LessThanOrEqualAny: case SN_Narrow: - case SN_Negate: - case SN_OnesComplement: case SN_Shuffle: case SN_Sqrt: case SN_Sum: @@ -1445,8 +1440,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_ToVector128Unsafe: case SN_WidenLower: case SN_WidenUpper: - case SN_WithElement: - case SN_get_IsHardwareAccelerated: return NULL; default: break; @@ -2207,14 +2200,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return ins; } - if (!COMPILE_LLVM(cfg) && fsig->params [0]->type != MONO_TYPE_GENERICINST) { + if (!COMPILE_LLVM (cfg) && fsig->params [0]->type != MONO_TYPE_GENERICINST) return NULL; - } MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems); MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException"); - if (COMPILE_LLVM(cfg) || type_to_width_log2 (arg0_type) == 3) { + if (COMPILE_LLVM (cfg) || type_to_width_log2 (arg0_type) == 3) { int insert_op = type_to_xinsert_op (arg0_type); MonoInst *ins = emit_simd_ins (cfg, klass, 
insert_op, args [0]->dreg, args [2]->dreg); ins->sreg3 = args [1]->dreg; From 1a0fc58fc15d05201b15065300a508f733e479a1 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Fri, 9 Jun 2023 18:18:58 -0400 Subject: [PATCH 12/21] Add compare methods. --- src/mono/mono/mini/cpu-amd64.mdesc | 1 + src/mono/mono/mini/mini-amd64.c | 19 +++++++++++++++++++ src/mono/mono/mini/mini-ops.h | 10 ++++++++-- src/mono/mono/mini/simd-intrinsics.c | 15 --------------- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index 3c3869fd7506b..eccb8eb18adfa 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -787,6 +787,7 @@ extract_i2: dest:i src1:x len:13 extract_i1: dest:i src1:x len:13 extract_r8: dest:f src1:x len:5 extract_r4: dest:f src1:x len:24 +xextract: dest:i src1:x len:24 iconv_to_r4_raw: dest:f src1:i len:10 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index b225d337862d3..a813a7fe6b233 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -7371,6 +7371,25 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) else amd64_sse_movsd_reg_reg (code, ins->dreg, ins->sreg2); break; + case OP_XEXTRACT: { + /* Elements are either 0 or 0xff */ + g_assert (ins->inst_c1 == 16); + amd64_sse_pmovmskb_reg_reg (code, ins->dreg, ins->sreg1); + if (ins->inst_c0 == SIMD_EXTR_ARE_ALL_SET) { + /* dreg = (mask == 0xffff) */ + amd64_alu_reg_imm_size (code, X86_CMP, ins->dreg, 0xffff, 4); + amd64_set_reg (code, X86_CC_EQ, ins->dreg, FALSE); + amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); + } else if (ins->inst_c0 == SIMD_EXTR_IS_ANY_SET) { + /* dreg = (mask != 0) */ + amd64_alu_reg_imm_size (code, X86_CMP, ins->dreg, 0, 4); + amd64_set_reg (code, X86_CC_NE, ins->dreg, FALSE); + amd64_widen_reg (code, ins->dreg, ins->dreg, FALSE, FALSE); + } else { + g_assert_not_reached (); + } + 
break; + } case OP_STOREX_MEMBASE_REG: case OP_STOREX_MEMBASE: amd64_sse_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1); diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index c74f3f47bf9ac..793d8e46984a5 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1511,8 +1511,14 @@ MINI_OP(OP_XCOMPARE_SCALAR, "xcompare_scalar", XREG, XREG, XREG) MINI_OP(OP_XCOMPARE_FP, "xcompare_fp", XREG, XREG, XREG) MINI_OP(OP_XCOMPARE_FP_SCALAR, "xcompare_fp_scalar", XREG, XREG, XREG) -/* Extract from XREG into IREG. - * inst_c0 - specific instruction, one of SIMD_EXTR_... */ +/* + * The input reg is the result of OP_XCOMPARE, i.e. + * every element is either 0 or 0xff. + * Compute an integer result based on whether all or any + * bits are non-zero. + * inst_c0 - specific instruction, one of SIMD_EXTR_... + * inst_c1 - vector size in bytes + */ MINI_OP(OP_XEXTRACT, "xextract", IREG, XREG, NONE) /* diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 40fa7dd233c4c..5b8e542770dfd 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1413,24 +1413,9 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_ConvertToUInt64: case SN_CreateScalar: case SN_CreateScalarUnsafe: - case SN_Equals: - case SN_EqualsAll: - case SN_EqualsAny: case SN_ExtractMostSignificantBits: case SN_GetLower: case SN_GetUpper: - case SN_GreaterThan: - case SN_GreaterThanAll: - case SN_GreaterThanAny: - case SN_GreaterThanOrEqual: - case SN_GreaterThanOrEqualAll: - case SN_GreaterThanOrEqualAny: - case SN_LessThan: - case SN_LessThanAll: - case SN_LessThanAny: - case SN_LessThanOrEqual: - case SN_LessThanOrEqualAll: - case SN_LessThanOrEqualAny: case SN_Narrow: case SN_Shuffle: case SN_Sqrt: From 013cb2a1444032ea7c11c7f243c52feb127b236e Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Fri, 9 Jun 2023 18:59:52 -0400
Subject: [PATCH 13/21] Add CreateScalar/CreateScalarUnsafe. --- src/mono/mono/mini/simd-intrinsics.c | 39 ++++++++++++---------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 5b8e542770dfd..5d52c4d17671b 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1411,8 +1411,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_ConvertToSingle: case SN_ConvertToUInt32: case SN_ConvertToUInt64: - case SN_CreateScalar: - case SN_CreateScalarUnsafe: case SN_ExtractMostSignificantBits: case SN_GetLower: case SN_GetUpper: @@ -1660,33 +1658,30 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return emit_vector_create_elementwise (cfg, fsig, fsig->ret, arg0_type, args); break; } - case SN_CreateScalar: { - MonoType *etype = get_vector_t_elem_type (fsig->ret); - if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) - return NULL; - if (COMPILE_LLVM (cfg)) - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR, -1, arg0_type, fsig, args); - else { - if (type_enum_is_float (arg0_type)) { - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_FLOAT, -1, arg0_type, fsig, args); - } else { - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_INT, -1, arg0_type, fsig, args); - } - } - - } + case SN_CreateScalar: case SN_CreateScalarUnsafe: { MonoType *etype = get_vector_t_elem_type (fsig->ret); if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; - if (COMPILE_LLVM (cfg)) - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE, -1, arg0_type, fsig, args); - else { + gboolean is_unsafe = id == SN_CreateScalarUnsafe; + if (COMPILE_LLVM (cfg)) { + return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? 
OP_CREATE_SCALAR_UNSAFE : OP_CREATE_SCALAR, -1, arg0_type, fsig, args); + } else { +#ifdef TARGET_AMD64 + MonoInst *ins; + + ins = emit_xzero (cfg, klass); + ins = emit_simd_ins (cfg, klass, type_to_insert_op (arg0_type), ins->dreg, args [0]->dreg); + ins->inst_c0 = 0; + ins->inst_c1 = arg0_type; + return ins; +#else if (type_enum_is_float (arg0_type)) { - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE_FLOAT, -1, arg0_type, fsig, args); + return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE_FLOAT : OP_CREATE_SCALAR_FLOAT, -1, arg0_type, fsig, args); } else { - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE_INT, -1, arg0_type, fsig, args); + return emit_simd_ins_for_sig (cfg, klass, is_unsafe ? OP_CREATE_SCALAR_UNSAFE_INT : OP_CREATE_SCALAR_INT, -1, arg0_type, fsig, args); } +#endif } } case SN_Dot: { From 202019632185359d05c25f208eafc016189c12b3 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Fri, 9 Jun 2023 20:05:26 -0400 Subject: [PATCH 14/21] Add Sqrt/Sum. 
--- src/mono/mono/arch/amd64/amd64-codegen.h | 1 + src/mono/mono/mini/cpu-amd64.mdesc | 5 +++-- src/mono/mono/mini/mini-amd64.c | 24 ++++++++++++++++++++++++ src/mono/mono/mini/mini-ops.h | 2 ++ src/mono/mono/mini/simd-intrinsics.c | 2 -- 5 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/mono/mono/arch/amd64/amd64-codegen.h b/src/mono/mono/arch/amd64/amd64-codegen.h index 60d962ada13da..69d915fddc56a 100644 --- a/src/mono/mono/arch/amd64/amd64-codegen.h +++ b/src/mono/mono/arch/amd64/amd64-codegen.h @@ -848,6 +848,7 @@ typedef union { #define amd64_sse_pinsrq_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_size_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x22, 8, (imm)) #define amd64_sse_pinsrw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0xc4, (imm)) #define amd64_sse_insertps_reg_reg(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x21, (imm)) +#define amd64_sse_pblendw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x0e, (imm)) #define amd64_sse_pextrw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0xc5, (imm)) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index eccb8eb18adfa..9b79856a797b5 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -604,8 +604,8 @@ xbinop: dest:x src1:x src2:x len:7 clob:1 xbinop_forceint: dest:x src1:x src2:x len:7 clob:1 ones_complement: dest:x src1:x len:16 clob:1 negate: dest:x src1:x len:24 clob:1 -xlower: dest:x src1:x len:7 clob:1 -xupper: dest:x src1:x len:7 clob:1 +xlower: dest:x src1:x len:16 +xupper: dest:x src1:x len:16 addps: dest:x src1:x src2:x len:4 clob:1 divps: dest:x src1:x src2:x len:4 clob:1 @@ -825,6 +825,7 @@ expand_i8: dest:x src1:i len:11 expand_r4: dest:x src1:f len:16 expand_r8: dest:x src1:f len:13 xop_x_x_x: dest:x src1:x src2:x len:16 
clob:1 +xop_x_x: dest:x src1:x len:16 clob:1 sse41_dpps_imm: dest:x src1:x src2:x len:7 clob:1 sse41_dppd_imm: dest:x src1:x src2:x len:7 clob:1 vector_andnot: dest:x src1:x src2:x len:7 clob:1 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index a813a7fe6b233..67485ca771e12 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -6732,6 +6732,20 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } break; } + case OP_XOP_X_X: { + switch (ins->inst_c0) { + case INTRINS_SIMD_SQRT_R4: + amd64_sse_sqrtps_reg_reg (code, ins->dreg, ins->sreg1); + break; + case INTRINS_SIMD_SQRT_R8: + amd64_sse_sqrtpd_reg_reg (code, ins->dreg, ins->sreg1); + break; + default: + g_assert_not_reached (); + break; + } + break; + } case OP_SSE41_DPPS_IMM: amd64_sse_dpps_reg_reg (code, ins->dreg, ins->sreg2, ins->inst_c0); break; @@ -7390,6 +7404,16 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } break; } + case OP_XLOWER: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_pblendw_reg_reg_imm (code, SIMD_TEMP_REG, ins->sreg1, 0b1111); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; + case OP_XUPPER: + amd64_sse_pxor_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_movhlps_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); + amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); + break; case OP_STOREX_MEMBASE_REG: case OP_STOREX_MEMBASE: amd64_sse_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1); diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index 793d8e46984a5..b7c1b268eff69 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1559,7 +1559,9 @@ MINI_OP(OP_XOP_OVR_BYSCALAR_X_X_X, "xop_ovr_byscalar_x_x_x", XREG, XREG, XREG) MINI_OP(OP_XCONCAT, "xconcat", XREG, XREG, XREG) MINI_OP(OP_XCAST, "xcast", XREG, XREG, NONE) +/* Return a new vector containing the lower half of 
the source */ MINI_OP(OP_XLOWER, "xlower", XREG, XREG, NONE) +/* Return a new vector containing the upper half of the source */ MINI_OP(OP_XUPPER, "xupper", XREG, XREG, NONE) MINI_OP(OP_XWIDEN, "xwiden", XREG, XREG, NONE) MINI_OP(OP_XWIDEN_UNSAFE, "xwiden_unsafe", XREG, XREG, NONE) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 5d52c4d17671b..2fc2eb59e6d59 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1416,8 +1416,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_GetUpper: case SN_Narrow: case SN_Shuffle: - case SN_Sqrt: - case SN_Sum: case SN_ToScalar: case SN_ToVector128: case SN_ToVector128Unsafe: From 4b970d5f3c29cbb44aa96ee9af5e637a55bf82f7 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Fri, 9 Jun 2023 21:47:28 -0400 Subject: [PATCH 15/21] Fix GetElement/WithElement. --- src/mono/mono/mini/simd-intrinsics.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 2fc2eb59e6d59..47e86b12a7f72 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1412,8 +1412,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_ConvertToUInt32: case SN_ConvertToUInt64: case SN_ExtractMostSignificantBits: - case SN_GetLower: - case SN_GetUpper: case SN_Narrow: case SN_Shuffle: case SN_ToScalar: @@ -1422,6 +1420,10 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi case SN_WidenLower: case SN_WidenUpper: return NULL; + case SN_GetLower: + case SN_GetUpper: + /* These return a Vector64 */ + return NULL; default: break; } @@ -1869,7 +1871,14 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems); 
MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException"); - if (COMPILE_LLVM(cfg) || type_to_width_log2 (arg0_type) == 3) { + gboolean use_xextract; +#ifdef TARGET_AMD64 + use_xextract = FALSE; +#else + use_xextract = type_to_width_log2 (arg0_type) == 3; +#endif + + if (COMPILE_LLVM (cfg) || use_xextract) { // Use optimized paths for 64-bit extractions or whatever LLVM yields if enabled. int extract_op = type_to_xextract_op (arg0_type); return emit_simd_ins_for_sig (cfg, klass, extract_op, -1, arg0_type, fsig, args); @@ -2184,7 +2193,14 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi MONO_EMIT_NEW_BIALU_IMM (cfg, OP_COMPARE_IMM, -1, args [1]->dreg, elems); MONO_EMIT_NEW_COND_EXC (cfg, GE_UN, "ArgumentOutOfRangeException"); - if (COMPILE_LLVM (cfg) || type_to_width_log2 (arg0_type) == 3) { + gboolean use_xextract; +#ifdef TARGET_AMD64 + use_xextract = FALSE; +#else + use_xextract = type_to_width_log2 (arg0_type) == 3; +#endif + + if (COMPILE_LLVM (cfg) || use_xextract) { int insert_op = type_to_xinsert_op (arg0_type); MonoInst *ins = emit_simd_ins (cfg, klass, insert_op, args [0]->dreg, args [2]->dreg); ins->sreg3 = args [1]->dreg; From 2c9964fed2edec7da85e5db2a8c85e9b5df2c4d5 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Sat, 10 Jun 2023 08:42:41 -0400 Subject: [PATCH 16/21] Implement some of the convert methods, disable the rest. 
--- src/mono/mono/mini/cpu-amd64.mdesc | 2 + src/mono/mono/mini/mini-amd64.c | 6 ++ src/mono/mono/mini/mini-ops.h | 1 + src/mono/mono/mini/simd-intrinsics.c | 100 +++++++++++++++++---------- 4 files changed, 74 insertions(+), 35 deletions(-) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index 9b79856a797b5..aafd6b4f332b7 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -606,6 +606,8 @@ ones_complement: dest:x src1:x len:16 clob:1 negate: dest:x src1:x len:24 clob:1 xlower: dest:x src1:x len:16 xupper: dest:x src1:x len:16 +convert_fp_to_si: dest:x src1:x len:16 +convert_si_to_fp: dest:x src1:x len:16 addps: dest:x src1:x src2:x len:4 clob:1 divps: dest:x src1:x src2:x len:4 clob:1 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 67485ca771e12..d3cc1d03f88d6 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -7414,6 +7414,12 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_movhlps_reg_reg (code, SIMD_TEMP_REG, ins->sreg1); amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); break; + case OP_CVT_FP_SI: + amd64_sse_cvttps2dq_reg_reg (code, ins->dreg, ins->sreg1); + break; + case OP_CVT_SI_FP: + amd64_sse_cvtdq2ps_reg_reg (code, ins->dreg, ins->sreg1); + break; case OP_STOREX_MEMBASE_REG: case OP_STOREX_MEMBASE: amd64_sse_movups_membase_reg (code, ins->dreg, ins->inst_offset, ins->sreg1); diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index b7c1b268eff69..8db4ec672ab50 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1846,6 +1846,7 @@ MINI_OP(OP_CVT_SI_FP_SCALAR, "convert_si_to_fp_scalar", XREG, XREG, NONE) /* inst_c1 is one of the MONO_TYPE_ constants */ MINI_OP(OP_NEGATION, "negate", XREG, XREG, NONE) MINI_OP(OP_NEGATION_SCALAR, "negate_scalar", XREG, XREG, NONE) +/* Select bits from src2/src3 using src1 */ 
MINI_OP3(OP_BSL, "bitwise_select", XREG, XREG, XREG, XREG) #endif // TARGET_ARM64 || TARGET_AMD64 || TARGET_WASM diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 47e86b12a7f72..248b0412a9388 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1403,30 +1403,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi #ifdef TARGET_WIN32 return NULL; #endif - switch (id) { - case SN_ConditionalSelect: - case SN_ConvertToDouble: - case SN_ConvertToInt32: - case SN_ConvertToInt64: - case SN_ConvertToSingle: - case SN_ConvertToUInt32: - case SN_ConvertToUInt64: - case SN_ExtractMostSignificantBits: - case SN_Narrow: - case SN_Shuffle: - case SN_ToScalar: - case SN_ToVector128: - case SN_ToVector128Unsafe: - case SN_WidenLower: - case SN_WidenUpper: - return NULL; - case SN_GetLower: - case SN_GetUpper: - /* These return a Vector64 */ - return NULL; - default: - break; - } } #endif @@ -1541,9 +1517,22 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi #endif } case SN_ConditionalSelect: { -#if defined(TARGET_ARM64) || defined(TARGET_AMD64) || defined(TARGET_WASM) if (!is_element_type_primitive (fsig->params [0])) return NULL; + +#if defined(TARGET_ARM64) || defined(TARGET_AMD64) || defined(TARGET_WASM) + +#if defined(TARGET_AMD64) + if (!COMPILE_LLVM (cfg)) { + MonoInst *val1 = emit_simd_ins (cfg, klass, OP_XBINOP_FORCEINT, args [0]->dreg, args [1]->dreg); + val1->inst_c0 = XBINOP_FORCEINT_AND; + MonoInst *val2 = emit_simd_ins (cfg, klass, OP_VECTOR_ANDN, args [0]->dreg, args [2]->dreg); + MonoInst *ins = emit_simd_ins (cfg, klass, OP_XBINOP_FORCEINT, val1->dreg, val2->dreg); + ins->inst_c0 = XBINOP_FORCEINT_OR; + return ins; + } +#endif + return emit_simd_ins_for_sig (cfg, klass, OP_BSL, -1, arg0_type, fsig, args); #else return NULL; @@ -1567,6 +1556,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, 
MonoMethodSignature *fsi op = arg0_type == MONO_TYPE_I8 ? OP_CVT_SI_FP_SCALAR : OP_CVT_UI_FP_SCALAR; else op = arg0_type == MONO_TYPE_I8 ? OP_CVT_SI_FP : OP_CVT_UI_FP; + +#ifdef TARGET_AMD64 + // Fall back to the c# code + if (!COMPILE_LLVM (cfg)) + return NULL; +#endif + return emit_simd_ins_for_sig (cfg, klass, op, -1, arg0_type, fsig, args); #else return NULL; @@ -1585,6 +1581,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } #endif #if defined(TARGET_ARM64) || defined(TARGET_AMD64) + +#if defined(TARGET_AMD64) + if (!COMPILE_LLVM (cfg) && id == SN_ConvertToUInt32) + // FIXME: + return NULL; +#endif + int op = id == SN_ConvertToInt32 ? OP_CVT_FP_SI : OP_CVT_FP_UI; return emit_simd_ins_for_sig (cfg, klass, op, -1, arg0_type, fsig, args); #else @@ -1611,6 +1614,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi op = size == 8 ? OP_CVT_FP_SI_SCALAR : OP_CVT_FP_SI; else op = size == 8 ? OP_CVT_FP_UI_SCALAR : OP_CVT_FP_UI; + +#if defined(TARGET_AMD64) + if (!COMPILE_LLVM (cfg)) + // FIXME: + return NULL; +#endif + return emit_simd_ins_for_sig (cfg, klass, op, -1, arg0_type, fsig, args); #else return NULL; @@ -1628,6 +1638,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi #endif #if defined(TARGET_ARM64) || defined(TARGET_AMD64) int op = arg0_type == MONO_TYPE_I4 ? OP_CVT_SI_FP : OP_CVT_UI_FP; + +#if defined(TARGET_AMD64) + if (!COMPILE_LLVM (cfg) && op == OP_CVT_UI_FP) + // FIXME: + return NULL; +#endif + return emit_simd_ins_for_sig (cfg, klass, op, -1, arg0_type, fsig, args); #else return NULL; @@ -1906,6 +1923,13 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!is_element_type_primitive (fsig->params [0])) return NULL; int op = id == SN_GetLower ? 
OP_XLOWER : OP_XUPPER; + +#ifdef TARGET_AMD64 + if (!COMPILE_LLVM (cfg)) + /* These return a Vector64 */ + return NULL; +#endif + return emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args); } case SN_GreaterThan: @@ -2109,6 +2133,9 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (vector_size == 128 && (arg0_type == MONO_TYPE_I1 || arg0_type == MONO_TYPE_U1)) return emit_simd_ins_for_sig (cfg, klass, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_TBL1, 0, fsig, args); return NULL; +#elif defined(TARGET_AMD64) + // FIXME: + return NULL; #else return NULL; #endif @@ -2255,16 +2282,19 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi } #endif #if defined(TARGET_ARM64) || defined(TARGET_WASM) - int op = id == SN_WidenLower ? OP_XLOWER : OP_XUPPER; - MonoInst *lower_or_upper_half = emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args); - if (type_enum_is_float (arg0_type)) { - return emit_simd_ins (cfg, klass, OP_SIMD_FCVTL, lower_or_upper_half->dreg, -1); - } else { - int zero = alloc_ireg (cfg); - MONO_EMIT_NEW_ICONST (cfg, zero, 0); - op = type_enum_is_unsigned (arg0_type) ? OP_SIMD_USHLL : OP_SIMD_SSHLL; - return emit_simd_ins (cfg, klass, op, lower_or_upper_half->dreg, zero); - } + int op = id == SN_WidenLower ? OP_XLOWER : OP_XUPPER; + MonoInst *lower_or_upper_half = emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args); + if (type_enum_is_float (arg0_type)) { + return emit_simd_ins (cfg, klass, OP_SIMD_FCVTL, lower_or_upper_half->dreg, -1); + } else { + int zero = alloc_ireg (cfg); + MONO_EMIT_NEW_ICONST (cfg, zero, 0); + op = type_enum_is_unsigned (arg0_type) ? 
OP_SIMD_USHLL : OP_SIMD_SSHLL; + return emit_simd_ins (cfg, klass, op, lower_or_upper_half->dreg, zero); + } +#elif defined(TARGET_AMD64) + // FIXME: + return NULL; #else return NULL; #endif From 9bbd66582c94df97a5b7a5591899b60268ffd006 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Sat, 10 Jun 2023 12:52:48 -0400 Subject: [PATCH 17/21] Fix GetElement. --- src/mono/mono/mini/cpu-amd64.mdesc | 3 +-- src/mono/mono/mini/mini-amd64.c | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/mono/mono/mini/cpu-amd64.mdesc b/src/mono/mono/mini/cpu-amd64.mdesc index aafd6b4f332b7..0842ac8b470b6 100644 --- a/src/mono/mono/mini/cpu-amd64.mdesc +++ b/src/mono/mono/mini/cpu-amd64.mdesc @@ -781,10 +781,9 @@ xones: dest:x len:5 xconst: dest:x len:12 iconv_to_x: dest:x src1:i len:5 -extract_i4: dest:i src1:x len:5 +extract_i4: dest:i src1:x len:16 extract_i8: dest:i src1:x len:9 - extract_i2: dest:i src1:x len:13 extract_i1: dest:i src1:x len:13 extract_r8: dest:f src1:x len:5 diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index d3cc1d03f88d6..3e2caa3098202 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -7260,7 +7260,11 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_movd_xreg_reg_size (code, ins->dreg, ins->sreg1, 4); break; case OP_EXTRACT_I4: - amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); + if (ins->inst_c0) { + amd64_sse_pextrd_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); + } else { + amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); + } break; case OP_EXTRACT_I8: if (ins->inst_c0) { From 300cc41279fb0aec45e519871e9844159c394f9a Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Sat, 10 Jun 2023 12:54:12 -0400 Subject: [PATCH 18/21] Require SSE41. 
--- src/mono/mono/arch/amd64/amd64-codegen.h | 2 +- src/mono/mono/mini/simd-intrinsics.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/arch/amd64/amd64-codegen.h b/src/mono/mono/arch/amd64/amd64-codegen.h index 69d915fddc56a..5907618868484 100644 --- a/src/mono/mono/arch/amd64/amd64-codegen.h +++ b/src/mono/mono/arch/amd64/amd64-codegen.h @@ -851,7 +851,7 @@ typedef union { #define amd64_sse_pblendw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x0e, (imm)) #define amd64_sse_pextrw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0xc5, (imm)) - +#define amd64_sse_pextrd_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (reg), (dreg), 0x66, 0x0f, 0x3a, 0x16, (imm)) #define amd64_sse_cvttsd2si_reg_xreg_size(inst,reg,xreg,size) emit_sse_reg_reg_size ((inst), (reg), (xreg), 0xf2, 0x0f, 0x2c, (size)) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 248b0412a9388..eb0f6e213b043 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -1403,6 +1403,9 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi #ifdef TARGET_WIN32 return NULL; #endif + if (!is_SIMD_feature_supported (cfg, MONO_CPU_X86_SSE41)) + /* Some opcodes like pextrd require sse41 */ + return NULL; } #endif From fa9de7caa9019538bcb9203b6b4323df015d2239 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Sat, 10 Jun 2023 17:32:52 -0400 Subject: [PATCH 19/21] Fix OP_EXTRACT_I1. 
--- src/mono/mono/arch/amd64/amd64-codegen.h | 1 + src/mono/mono/mini/mini-amd64.c | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mono/mono/arch/amd64/amd64-codegen.h b/src/mono/mono/arch/amd64/amd64-codegen.h index 5907618868484..8e4ec29460613 100644 --- a/src/mono/mono/arch/amd64/amd64-codegen.h +++ b/src/mono/mono/arch/amd64/amd64-codegen.h @@ -851,6 +851,7 @@ typedef union { #define amd64_sse_pblendw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0x3a, 0x0e, (imm)) #define amd64_sse_pextrw_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_imm ((inst), (dreg), (reg), 0x66, 0x0f, 0xc5, (imm)) +#define amd64_sse_pextrb_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (reg), (dreg), 0x66, 0x0f, 0x3a, 0x14, (imm)) #define amd64_sse_pextrd_reg_reg_imm(inst,dreg,reg,imm) emit_sse_reg_reg_op4_imm ((inst), (reg), (dreg), 0x66, 0x0f, 0x3a, 0x16, (imm)) #define amd64_sse_cvttsd2si_reg_xreg_size(inst,reg,xreg,size) emit_sse_reg_reg_size ((inst), (reg), (xreg), 0xf2, 0x0f, 0x2c, (size)) diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 3e2caa3098202..949d070ac305d 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -7275,9 +7275,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } break; case OP_EXTRACT_I1: - amd64_movd_reg_xreg_size (code, ins->dreg, ins->sreg1, 4); - if (ins->inst_c0) - amd64_shift_reg_imm (code, X86_SHR, ins->dreg, ins->inst_c0 * 8); + amd64_sse_pextrb_reg_reg_imm (code, ins->dreg, ins->sreg1, ins->inst_c0); amd64_widen_reg (code, ins->dreg, ins->dreg, ins->inst_c1 == MONO_TYPE_I1, FALSE); break; case OP_EXTRACT_I2: From 54a655d2ae9b6aa7f53db1ddc3f8b16a926aafcf Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Mon, 12 Jun 2023 07:54:50 -0400 Subject: [PATCH 20/21] Fix INSERT_R8. 
--- src/mono/mono/arch/amd64/amd64-codegen.h | 1 + src/mono/mono/mini/mini-amd64.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mono/mono/arch/amd64/amd64-codegen.h b/src/mono/mono/arch/amd64/amd64-codegen.h index 8e4ec29460613..5ebb5ae5c38f4 100644 --- a/src/mono/mono/arch/amd64/amd64-codegen.h +++ b/src/mono/mono/arch/amd64/amd64-codegen.h @@ -1188,6 +1188,7 @@ typedef union { #define amd64_sse_dppd_reg_reg(inst, dreg, sreg, mask) emit_sse_reg_reg_op4_imm((inst), (dreg), (sreg), 0x66, 0x0f, 0x3a, 0x41, (mask)) #define amd64_sse_phaddw_reg_reg(inst, dreg, sreg) emit_sse_reg_reg_op4((inst), (dreg), (sreg), 0x66, 0x0f, 0x38, 0x01) #define amd64_sse_phaddd_reg_reg(inst, dreg, sreg) emit_sse_reg_reg_op4((inst), (dreg), (sreg), 0x66, 0x0f, 0x38, 0x02) +#define amd64_sse_blendpd_reg_reg(inst,dreg,sreg,imm) emit_sse_reg_reg_op4_imm((inst), (dreg), (sreg), 0x66, 0x0f, 0x3a, 0x0d, (imm)) #define amd64_movq_reg_reg(inst,dreg,sreg) emit_sse_reg_reg ((inst), (dreg), (sreg), 0xf3, 0x0f, 0x7e) /* Generated from x86-codegen.h */ diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 949d070ac305d..203734b0abfd9 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -7322,7 +7322,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } case OP_INSERT_R8: { if (ins->inst_c0 == 0) { - amd64_sse_orpd_reg_reg (code, ins->dreg, ins->sreg2); + amd64_sse_blendpd_reg_reg (code, ins->dreg, ins->sreg2, 0b1); } else { g_assert (ins->inst_c0 == 1); amd64_movlhps_reg_reg (code, ins->dreg, ins->sreg2); From cfaba986d6fc6846d27bdcf8a58fa1ea47ceb963 Mon Sep 17 00:00:00 2001 From: Zoltan Varga Date: Mon, 12 Jun 2023 08:48:47 -0400 Subject: [PATCH 21/21] Improve OP_NEGATION R4/R8. 
--- src/mono/mono/mini/mini-amd64.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/src/mono/mono/mini/mini-amd64.c b/src/mono/mono/mini/mini-amd64.c index 203734b0abfd9..9b7c90e45a13f 100644 --- a/src/mono/mono/mini/mini-amd64.c +++ b/src/mono/mono/mini/mini-amd64.c @@ -6783,33 +6783,17 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) amd64_sse_movaps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); break; case MONO_TYPE_R4: { - static float r8_0 [] = {-0.0, -0.0, -0.0, -0.0 }; - - if (cfg->compile_aot && cfg->code_exec_only) { - mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, &r8_0); - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof(target_mgreg_t)); - amd64_sse_movups_reg_membase (code, SIMD_TEMP_REG, AMD64_R11, 0); - } else { - mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, &r8_0); - amd64_sse_movups_reg_membase (code, SIMD_TEMP_REG, AMD64_RIP, 0); - } - + /* -0.0 */ + amd64_sse_pcmpeqw_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_pslld_reg_imm (code, SIMD_TEMP_REG, 31); g_assert (ins->sreg1 == ins->dreg); amd64_sse_xorps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); break; } case MONO_TYPE_R8: { - static double r8_0 [] = {-0.0, -0.0 }; - - if (cfg->compile_aot && cfg->code_exec_only) { - mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, &r8_0); - amd64_mov_reg_membase (code, AMD64_R11, AMD64_RIP, 0, sizeof(target_mgreg_t)); - amd64_sse_movups_reg_membase (code, SIMD_TEMP_REG, AMD64_R11, 0); - } else { - mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, &r8_0); - amd64_sse_movups_reg_membase (code, SIMD_TEMP_REG, AMD64_RIP, 0); - } - + /* -0.0 */ + amd64_sse_pcmpeqw_reg_reg (code, SIMD_TEMP_REG, SIMD_TEMP_REG); + amd64_sse_psllq_reg_imm (code, SIMD_TEMP_REG, 63); g_assert (ins->sreg1 == ins->dreg); amd64_sse_xorps_reg_reg (code, ins->dreg, SIMD_TEMP_REG); break;