Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mono] Implement Arm intrinsics: ArmBase, Crc32 #34240

Merged
merged 10 commits into from
Apr 7, 2020
5 changes: 4 additions & 1 deletion src/mono/mono/mini/aot-compiler.c
Original file line number Diff line number Diff line change
Expand Up @@ -8111,7 +8111,10 @@ parse_cpu_features (const gchar *attr)
feature = (MonoCPUFeatures) (MONO_CPU_X86_FULL_SSEAVX_COMBINED & ~feature);

#elif defined(TARGET_ARM64)
// TODO: neon, sha1, sha2, asimd, etc...
if (!strcmp (attr + prefix, "base"))
feature = MONO_CPU_ARM64_BASE;
else if (!strcmp (attr + prefix, "crc"))
feature = MONO_CPU_ARM64_CRC;
#elif defined(TARGET_WASM)
if (!strcmp (attr + prefix, "simd"))
feature = MONO_CPU_WASM_SIMD;
Expand Down
12 changes: 12 additions & 0 deletions src/mono/mono/mini/llvm-intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,18 @@ INTRINS_OVR(WASM_ANYTRUE_V8, wasm_anytrue)
INTRINS_OVR(WASM_ANYTRUE_V4, wasm_anytrue)
INTRINS_OVR(WASM_ANYTRUE_V2, wasm_anytrue)
#endif
#if defined(TARGET_ARM64)
INTRINS_OVR(BITREVERSE_I32, bitreverse)
INTRINS_OVR(BITREVERSE_I64, bitreverse)
INTRINS(AARCH64_CRC32B, aarch64_crc32b)
INTRINS(AARCH64_CRC32H, aarch64_crc32h)
INTRINS(AARCH64_CRC32W, aarch64_crc32w)
INTRINS(AARCH64_CRC32X, aarch64_crc32x)
INTRINS(AARCH64_CRC32CB, aarch64_crc32cb)
INTRINS(AARCH64_CRC32CH, aarch64_crc32ch)
INTRINS(AARCH64_CRC32CW, aarch64_crc32cw)
INTRINS(AARCH64_CRC32CX, aarch64_crc32cx)
#endif

#undef INTRINS
#undef INTRINS_OVR
Expand Down
14 changes: 14 additions & 0 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,20 @@ mono_arch_fregname (int reg)
return "unknown fp";
}

/*
 * mono_arch_xregname:
 *
 *   Return the printable name ("v0" .. "v31") for the register index REG.
 * Any index outside of that range yields the string "unknown".
 */
const char *
mono_arch_xregname (int reg)
{
	static const char * const xreg_names [] = {
		"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
		"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
		"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
		"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
	};
	/* The table holds exactly 32 entries, so this bound equals 32. */
	const int xreg_count = (int)(sizeof (xreg_names) / sizeof (xreg_names [0]));

	/* Reject out-of-range indices first so the table lookup below is always safe. */
	if (reg < 0 || reg >= xreg_count)
		return "unknown";

	return xreg_names [reg];
}

int
mono_arch_get_argument_info (MonoMethodSignature *csig, int param_count, MonoJitArgumentInfo *arg_info)
{
Expand Down
9 changes: 9 additions & 0 deletions src/mono/mono/mini/mini-arm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@

#define MONO_MAX_IREGS 32
#define MONO_MAX_FREGS 32
#define MONO_MAX_XREGS 32

#if !defined(DISABLE_SIMD) && defined(ENABLE_NETCORE)
#define MONO_ARCH_SIMD_INTRINSICS 1
#endif

#define MONO_CONTEXT_SET_LLVM_EXC_REG(ctx, exc) do { (ctx)->regs [0] = (gsize)exc; } while (0)

Expand All @@ -41,6 +46,10 @@
/* v8..v15 */
#define MONO_ARCH_CALLEE_SAVED_FREGS 0xff00

#define MONO_ARCH_CALLEE_SAVED_XREGS 0

#define MONO_ARCH_CALLEE_XREGS MONO_ARCH_CALLEE_FREGS

#define MONO_ARCH_USE_FPSTACK FALSE

#define MONO_ARCH_INST_SREG2_MASK(ins) (0)
Expand Down
93 changes: 84 additions & 9 deletions src/mono/mono/mini/mini-llvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -8657,14 +8657,6 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
case OP_POPCNT64:
values [ins->dreg] = call_intrins (ctx, INTRINS_CTPOP_I64, &lhs, "");
break;
case OP_LZCNT32:
case OP_LZCNT64: {
LLVMValueRef args [2];
args [0] = lhs;
args [1] = LLVMConstInt (LLVMInt1Type (), 1, FALSE);
values [ins->dreg] = call_intrins (ctx, ins->opcode == OP_LZCNT32 ? INTRINS_CTLZ_I32 : INTRINS_CTLZ_I64, args, "");
break;
}
case OP_CTTZ32:
case OP_CTTZ64: {
LLVMValueRef args [2];
Expand Down Expand Up @@ -8724,7 +8716,78 @@ process_bb (EmitContext *ctx, MonoBasicBlock *bb)
break;
}
#endif /* ENABLE_NETCORE */
#endif /* SIMD */
#endif /* defined(TARGET_X86) || defined(TARGET_AMD64) */

// Shared between ARM64 and X86
#if defined(ENABLE_NETCORE) && (defined(TARGET_ARM64) || defined(TARGET_X86) || defined(TARGET_AMD64))
case OP_LZCNT32:
case OP_LZCNT64: {
LLVMValueRef args [2];
args [0] = lhs;
args [1] = LLVMConstInt (LLVMInt1Type (), 1, FALSE);
values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, ins->opcode == OP_LZCNT32 ? INTRINS_CTLZ_I32 : INTRINS_CTLZ_I64), args, 2, "");
break;
}
#endif

#if defined(ENABLE_NETCORE) && defined(TARGET_ARM64)
case OP_XOP_I4_I4:
case OP_XOP_I8_I8: {
IntrinsicId id = (IntrinsicId)0;
switch (ins->inst_c0) {
case SIMD_OP_ARM64_RBIT32: id = INTRINS_BITREVERSE_I32; break;
case SIMD_OP_ARM64_RBIT64: id = INTRINS_BITREVERSE_I64; break;
default: g_assert_not_reached (); break;
}
values [ins->dreg] = call_intrins (ctx, id, &lhs, "");
break;
}
case OP_XOP_I4_I4_I4:
case OP_XOP_I4_I4_I8: {
IntrinsicId id = (IntrinsicId)0;
gboolean zext_last = FALSE;
switch (ins->inst_c0) {
case SIMD_OP_ARM64_CRC32B: id = INTRINS_AARCH64_CRC32B; zext_last = TRUE; break;
case SIMD_OP_ARM64_CRC32H: id = INTRINS_AARCH64_CRC32H; zext_last = TRUE; break;
case SIMD_OP_ARM64_CRC32W: id = INTRINS_AARCH64_CRC32W; zext_last = TRUE; break;
case SIMD_OP_ARM64_CRC32X: id = INTRINS_AARCH64_CRC32X; break;
case SIMD_OP_ARM64_CRC32CB: id = INTRINS_AARCH64_CRC32CB; zext_last = TRUE; break;
case SIMD_OP_ARM64_CRC32CH: id = INTRINS_AARCH64_CRC32CH; zext_last = TRUE; break;
case SIMD_OP_ARM64_CRC32CW: id = INTRINS_AARCH64_CRC32CW; zext_last = TRUE; break;
case SIMD_OP_ARM64_CRC32CX: id = INTRINS_AARCH64_CRC32CX; break;
default: g_assert_not_reached (); break;
}
LLVMValueRef arg1 = rhs;
if (zext_last)
arg1 = LLVMBuildZExt (ctx->builder, arg1, LLVMInt32Type (), "");
LLVMValueRef args [] = { lhs, arg1 };
values [ins->dreg] = call_intrins (ctx, id, args, "");
break;
}
case OP_LSCNT32:
case OP_LSCNT64: {
// %shr = ashr i32 %x, 31
// %xor = xor i32 %shr, %x
// %mul = shl i32 %xor, 1
// %add = or i32 %mul, 1
// %0 = tail call i32 @llvm.ctlz.i32(i32 %add, i1 false)
LLVMValueRef shr = LLVMBuildAShr (builder, lhs, ins->opcode == OP_LSCNT32 ?
LLVMConstInt (LLVMInt32Type (), 31, FALSE) :
LLVMConstInt (LLVMInt64Type (), 63, FALSE), "");
LLVMValueRef one = ins->opcode == OP_LSCNT32 ?
LLVMConstInt (LLVMInt32Type (), 1, FALSE) :
LLVMConstInt (LLVMInt64Type (), 1, FALSE);
LLVMValueRef xor = LLVMBuildXor (builder, shr, lhs, "");
LLVMValueRef mul = LLVMBuildShl (builder, xor, one, "");
LLVMValueRef add = LLVMBuildOr (builder, mul, one, "");

LLVMValueRef args [2];
args [0] = add;
args [1] = LLVMConstInt (LLVMInt1Type (), 0, FALSE);
values [ins->dreg] = LLVMBuildCall (builder, get_intrins (ctx, ins->opcode == OP_LSCNT32 ? INTRINS_CTLZ_I32 : INTRINS_CTLZ_I64), args, 2, "");
break;
}
#endif

case OP_DUMMY_USE:
break;
Expand Down Expand Up @@ -10130,6 +10193,14 @@ add_intrinsic (LLVMModuleRef module, int id)
case INTRINS_WASM_ANYTRUE_V2:
intrins = add_intrins1 (module, id, sse_i8_t);
break;
#endif
#ifdef TARGET_ARM64
case INTRINS_BITREVERSE_I32:
intrins = add_intrins1 (module, id, LLVMInt32Type ());
break;
case INTRINS_BITREVERSE_I64:
intrins = add_intrins1 (module, id, LLVMInt64Type ());
break;
#endif
default:
g_assert_not_reached ();
Expand Down Expand Up @@ -11484,9 +11555,13 @@ MonoCPUFeatures mono_llvm_get_cpu_features (void)
{ "lzcnt", MONO_CPU_X86_LZCNT },
{ "bmi", MONO_CPU_X86_BMI1 },
{ "bmi2", MONO_CPU_X86_BMI2 },
#endif
#if defined(TARGET_ARM64)
{ "crc", MONO_CPU_ARM64_CRC },
#endif
};
if (!cpu_features)
cpu_features = MONO_CPU_INITED | (MonoCPUFeatures)mono_llvm_check_cpu_features (flags_map, G_N_ELEMENTS (flags_map));

return cpu_features;
}
100 changes: 57 additions & 43 deletions src/mono/mono/mini/mini-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -785,6 +785,53 @@ MINI_OP(OP_NOT_NULL, "not_null", NONE, IREG, NONE)

/* SIMD opcodes. */

#if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_WASM) || defined(TARGET_ARM64)

MINI_OP(OP_EXTRACT_I4, "extract_i4", IREG, XREG, NONE)
MINI_OP(OP_ICONV_TO_R4_RAW, "iconv_to_r4_raw", FREG, IREG, NONE)

MINI_OP(OP_EXTRACT_I2, "extract_i2", IREG, XREG, NONE)
MINI_OP(OP_EXTRACT_U2, "extract_u2", IREG, XREG, NONE)
MINI_OP(OP_EXTRACT_I1, "extract_i1", IREG, XREG, NONE)
MINI_OP(OP_EXTRACT_U1, "extract_u1", IREG, XREG, NONE)
MINI_OP(OP_EXTRACT_R4, "extract_r4", FREG, XREG, NONE)
MINI_OP(OP_EXTRACT_R8, "extract_r8", FREG, XREG, NONE)
MINI_OP(OP_EXTRACT_I8, "extract_i8", LREG, XREG, NONE)

/* Used by LLVM */
MINI_OP(OP_INSERT_I1, "insert_i1", XREG, XREG, IREG)
MINI_OP(OP_INSERT_I2, "insert_i2", XREG, XREG, IREG)
MINI_OP(OP_INSERT_I4, "insert_i4", XREG, XREG, IREG)
MINI_OP(OP_INSERT_I8, "insert_i8", XREG, XREG, LREG)
MINI_OP(OP_INSERT_R4, "insert_r4", XREG, XREG, FREG)
MINI_OP(OP_INSERT_R8, "insert_r8", XREG, XREG, FREG)

MINI_OP(OP_EXTRACTX_U2, "extractx_u2", IREG, XREG, NONE)

/*these slow ops are modeled around the availability of a fast 2 bytes insert op*/
/*insertx_u1_slow takes old value and new value as source regs */
MINI_OP(OP_INSERTX_U1_SLOW, "insertx_u1_slow", XREG, IREG, IREG)
/*insertx_i4_slow takes target xreg and new value as source regs */
MINI_OP(OP_INSERTX_I4_SLOW, "insertx_i4_slow", XREG, XREG, IREG)

MINI_OP(OP_INSERTX_R4_SLOW, "insertx_r4_slow", XREG, XREG, FREG)
MINI_OP(OP_INSERTX_R8_SLOW, "insertx_r8_slow", XREG, XREG, FREG)
MINI_OP(OP_INSERTX_I8_SLOW, "insertx_i8_slow", XREG, XREG, LREG)

MINI_OP(OP_FCONV_TO_R4_X, "fconv_to_r4_x", XREG, FREG, NONE)
MINI_OP(OP_FCONV_TO_R8_X, "fconv_to_r8_x", XREG, FREG, NONE)
MINI_OP(OP_XCONV_R8_TO_I4, "xconv_r8_to_i4", IREG, XREG, NONE)
MINI_OP(OP_ICONV_TO_X, "iconv_to_x", XREG, IREG, NONE)

MINI_OP(OP_EXPAND_I1, "expand_i1", XREG, IREG, NONE)
MINI_OP(OP_EXPAND_I2, "expand_i2", XREG, IREG, NONE)
MINI_OP(OP_EXPAND_I4, "expand_i4", XREG, IREG, NONE)
MINI_OP(OP_EXPAND_R4, "expand_r4", XREG, FREG, NONE)
MINI_OP(OP_EXPAND_I8, "expand_i8", XREG, IREG, NONE)
MINI_OP(OP_EXPAND_R8, "expand_r8", XREG, FREG, NONE)

#endif

#if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_WASM)

MINI_OP(OP_ADDPS, "addps", XREG, XREG, XREG)
Expand Down Expand Up @@ -945,49 +992,6 @@ MINI_OP(OP_PSHLD_REG, "pshld_reg", XREG, XREG, XREG)
MINI_OP(OP_PSHLQ, "pshlq", XREG, XREG, NONE)
MINI_OP(OP_PSHLQ_REG, "pshlq_reg", XREG, XREG, XREG)

MINI_OP(OP_EXTRACT_I4, "extract_i4", IREG, XREG, NONE)
MINI_OP(OP_ICONV_TO_R4_RAW, "iconv_to_r4_raw", FREG, IREG, NONE)

MINI_OP(OP_EXTRACT_I2, "extract_i2", IREG, XREG, NONE)
MINI_OP(OP_EXTRACT_U2, "extract_u2", IREG, XREG, NONE)
MINI_OP(OP_EXTRACT_I1, "extract_i1", IREG, XREG, NONE)
MINI_OP(OP_EXTRACT_U1, "extract_u1", IREG, XREG, NONE)
MINI_OP(OP_EXTRACT_R4, "extract_r4", FREG, XREG, NONE)
MINI_OP(OP_EXTRACT_R8, "extract_r8", FREG, XREG, NONE)
MINI_OP(OP_EXTRACT_I8, "extract_i8", LREG, XREG, NONE)

/* Used by LLVM */
MINI_OP(OP_INSERT_I1, "insert_i1", XREG, XREG, IREG)
MINI_OP(OP_INSERT_I2, "insert_i2", XREG, XREG, IREG)
MINI_OP(OP_INSERT_I4, "insert_i4", XREG, XREG, IREG)
MINI_OP(OP_INSERT_I8, "insert_i8", XREG, XREG, LREG)
MINI_OP(OP_INSERT_R4, "insert_r4", XREG, XREG, FREG)
MINI_OP(OP_INSERT_R8, "insert_r8", XREG, XREG, FREG)

MINI_OP(OP_EXTRACTX_U2, "extractx_u2", IREG, XREG, NONE)

/*these slow ops are modeled around the availability of a fast 2 bytes insert op*/
/*insertx_u1_slow takes old value and new value as source regs */
MINI_OP(OP_INSERTX_U1_SLOW, "insertx_u1_slow", XREG, IREG, IREG)
/*insertx_i4_slow takes target xreg and new value as source regs */
MINI_OP(OP_INSERTX_I4_SLOW, "insertx_i4_slow", XREG, XREG, IREG)

MINI_OP(OP_INSERTX_R4_SLOW, "insertx_r4_slow", XREG, XREG, FREG)
MINI_OP(OP_INSERTX_R8_SLOW, "insertx_r8_slow", XREG, XREG, FREG)
MINI_OP(OP_INSERTX_I8_SLOW, "insertx_i8_slow", XREG, XREG, LREG)

MINI_OP(OP_FCONV_TO_R4_X, "fconv_to_r4_x", XREG, FREG, NONE)
MINI_OP(OP_FCONV_TO_R8_X, "fconv_to_r8_x", XREG, FREG, NONE)
MINI_OP(OP_XCONV_R8_TO_I4, "xconv_r8_to_i4", IREG, XREG, NONE)
MINI_OP(OP_ICONV_TO_X, "iconv_to_x", XREG, IREG, NONE)

MINI_OP(OP_EXPAND_I1, "expand_i1", XREG, IREG, NONE)
MINI_OP(OP_EXPAND_I2, "expand_i2", XREG, IREG, NONE)
MINI_OP(OP_EXPAND_I4, "expand_i4", XREG, IREG, NONE)
MINI_OP(OP_EXPAND_R4, "expand_r4", XREG, FREG, NONE)
MINI_OP(OP_EXPAND_I8, "expand_i8", XREG, IREG, NONE)
MINI_OP(OP_EXPAND_R8, "expand_r8", XREG, FREG, NONE)

MINI_OP(OP_PREFETCH_MEMBASE, "prefetch_membase", NONE, IREG, NONE)

MINI_OP(OP_CVTDQ2PD, "cvtdq2pd", XREG, XREG, NONE)
Expand Down Expand Up @@ -1522,6 +1526,11 @@ MINI_OP(OP_XOP_I8_X, "xop_i8_x", LREG, XREG, NONE)
MINI_OP(OP_XOP_X_X_X, "xop_x_x_x", XREG, XREG, XREG)
MINI_OP(OP_XOP_X_X_I4, "xop_x_x_i4", XREG, XREG, IREG)
MINI_OP(OP_XOP_X_X_I8, "xop_x_x_i8", XREG, XREG, LREG)
MINI_OP(OP_XOP_I4_I8, "xop_i4_i8", IREG, LREG, NONE)
MINI_OP(OP_XOP_I8_I8, "xop_i8_i8", LREG, LREG, NONE)
MINI_OP(OP_XOP_I4_I4, "xop_i4_i4", IREG, IREG, NONE)
MINI_OP(OP_XOP_I4_I4_I4, "xop_i4_i4_i4", IREG, IREG, IREG)
MINI_OP(OP_XOP_I4_I4_I8, "xop_i4_i4_i8", IREG, IREG, LREG)

MINI_OP(OP_XCAST, "xcast", XREG, XREG, NONE)
/* Extract element of vector */
Expand All @@ -1546,3 +1555,8 @@ MINI_OP(OP_LZCNT32, "lzcnt32", IREG, IREG, NONE)
MINI_OP(OP_LZCNT64, "lzcnt64", LREG, LREG, NONE)
MINI_OP(OP_POPCNT32, "popcnt32", IREG, IREG, NONE)
MINI_OP(OP_POPCNT64, "popcnt64", LREG, LREG, NONE)

#ifdef TARGET_ARM64
MINI_OP(OP_LSCNT32, "lscnt32", IREG, IREG, NONE)
MINI_OP(OP_LSCNT64, "lscnt64", LREG, LREG, NONE)
#endif // TARGET_ARM64
5 changes: 5 additions & 0 deletions src/mono/mono/mini/mini.c
Original file line number Diff line number Diff line change
Expand Up @@ -4346,6 +4346,11 @@ mini_get_cpu_features (MonoCompile* cfg)
}
#endif

#if defined(TARGET_ARM64)
// All Arm64 devices have this set
features |= MONO_CPU_ARM64_BASE;
#endif
EgorBo marked this conversation as resolved.
Show resolved Hide resolved

// apply parameters passed via -mattr
return (features | mono_cpu_features_enabled) & ~mono_cpu_features_disabled;
}
21 changes: 20 additions & 1 deletion src/mono/mono/mini/mini.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,12 @@ enum {
#define MONO_IS_REAL_MOVE(ins) (((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_FMOVE) || ((ins)->opcode == OP_XMOVE) || ((ins)->opcode == OP_RMOVE))
#define MONO_IS_ZERO(ins) (((ins)->opcode == OP_VZERO) || ((ins)->opcode == OP_XZERO))

#ifdef TARGET_ARM64
// FIXME: enable for Arm64
#define MONO_CLASS_IS_SIMD(cfg, klass) (0)
#else
#define MONO_CLASS_IS_SIMD(cfg, klass) (((cfg)->opt & MONO_OPT_SIMD) && m_class_is_simd_type (klass))
#endif

#else

Expand Down Expand Up @@ -2842,6 +2847,10 @@ typedef enum {
#ifdef TARGET_WASM
MONO_CPU_WASM_SIMD = 1 << 1,
#endif
#ifdef TARGET_ARM64
MONO_CPU_ARM64_BASE = 1 << 1,
MONO_CPU_ARM64_CRC = 1 << 2,
#endif
} MonoCPUFeatures;

G_ENUM_FUNCTIONS (MonoCPUFeatures)
Expand Down Expand Up @@ -2936,7 +2945,17 @@ typedef enum {
SIMD_OP_SSE_PSIGND,
SIMD_OP_SSE_PMADDUBSW,
SIMD_OP_SSE_PMULHRSW,
SIMD_OP_SSE_LDDQU
SIMD_OP_SSE_LDDQU,
SIMD_OP_ARM64_CRC32B,
SIMD_OP_ARM64_CRC32H,
SIMD_OP_ARM64_CRC32W,
SIMD_OP_ARM64_CRC32X,
SIMD_OP_ARM64_CRC32CB,
SIMD_OP_ARM64_CRC32CH,
SIMD_OP_ARM64_CRC32CW,
SIMD_OP_ARM64_CRC32CX,
SIMD_OP_ARM64_RBIT32,
SIMD_OP_ARM64_RBIT64
} SimdOp;

const char *mono_arch_xregname (int reg);
Expand Down
Loading