Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mono]: Add support for callee saved XMM registers on Windows x64. #97326

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/mono/mono/arch/amd64/amd64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,9 @@ typedef enum

#define AMD64_CALLEE_SAVED_REGS ((1<<AMD64_RDI) | (1<<AMD64_RSI) | (1<<AMD64_RBX) | (1<<AMD64_R12) | (1<<AMD64_R13) | (1<<AMD64_R14) | (1<<AMD64_R15) | (1<<AMD64_RBP))
#define AMD64_IS_CALLEE_SAVED_REG(reg) (AMD64_CALLEE_SAVED_REGS & (1 << (reg)))

#define AMD64_CALLEE_SAVED_XREGS ((1<<AMD64_XMM6) | (1<<AMD64_XMM7) | (1<<AMD64_XMM8) | (1<<AMD64_XMM9) | (1<<AMD64_XMM10) | (1<<AMD64_XMM11) | (1<<AMD64_XMM12) | (1<<AMD64_XMM13) | (1<<AMD64_XMM14) | (1<<AMD64_XMM15))
#define AMD64_IS_CALLEE_SAVED_XREG(reg) (AMD64_CALLEE_SAVED_XREGS & (1 << (reg)))
#else
#define AMD64_CALLEE_REGS ((1<<AMD64_RAX) | (1<<AMD64_RCX) | (1<<AMD64_RDX) | (1<<AMD64_RSI) | (1<<AMD64_RDI) | (1<<AMD64_R8) | (1<<AMD64_R9) | (1<<AMD64_R10))
#define AMD64_IS_CALLEE_REG(reg) (AMD64_CALLEE_REGS & (1 << (reg)))
Expand Down
141 changes: 132 additions & 9 deletions src/mono/mono/mini/exceptions-amd64.c
Original file line number Diff line number Diff line change
Expand Up @@ -258,13 +258,27 @@ mono_arch_get_restore_context (MonoTrampInfo **info, gboolean aot)

amd64_mov_reg_reg (code, AMD64_R11, AMD64_ARG_REG1, 8);

/* Restore all registers except %rip and %r11 */
/* Restore all general registers except %rip and %r11 */
gregs_offset = MONO_STRUCT_OFFSET (MonoContext, gregs);
for (i = 0; i < AMD64_NREG; ++i) {
if (i != AMD64_RIP && i != AMD64_RSP && i != AMD64_R8 && i != AMD64_R9 && i != AMD64_R10 && i != AMD64_R11)
amd64_mov_reg_membase (code, i, AMD64_R11, gregs_offset + (i * 8), 8);
}

#ifdef AMD64_CALLEE_SAVED_XREGS
/* Restore all callee saved XMM registers */
int fregs_offset = MONO_STRUCT_OFFSET (MonoContext, fregs);
for (i = 0; i < AMD64_XMM_NREG; ++i) {
if (AMD64_IS_CALLEE_SAVED_XREG (i)) {
#if defined(MONO_HAVE_SIMD_REG)
amd64_movdqu_reg_membase (code, i, AMD64_R11, fregs_offset + (i * sizeof (MonoContextSimdReg)));
#else
amd64_movsd_reg_membase (code, i, AMD64_R11, fregs_offset + (i * sizeof (double)));
#endif
}
}
#endif

/*
* The context resides on the stack, in the stack frame of the
* caller of this function. The stack pointer that we need to
Expand Down Expand Up @@ -467,7 +481,11 @@ get_throw_trampoline (MonoTrampInfo **info, gboolean rethrow, gboolean corlib, g
MonoJumpInfo *ji = NULL;
GSList *unwind_ops = NULL;
int i, stack_size, arg_offsets [16], ctx_offset, regs_offset;
#ifdef AMD64_CALLEE_SAVED_XREGS
const int kMaxCodeSize = 300;
#else
const int kMaxCodeSize = 256;
#endif

#ifdef TARGET_WIN32
const int dummy_stack_space = 6 * sizeof (target_mgreg_t); /* Windows expects stack space allocated for all 6 dummy args. */
Expand Down Expand Up @@ -517,6 +535,20 @@ get_throw_trampoline (MonoTrampInfo **info, gboolean rethrow, gboolean corlib, g
/* Save IP */
amd64_mov_reg_membase (code, AMD64_RAX, AMD64_RSP, stack_size, sizeof (target_mgreg_t));
amd64_mov_membase_reg (code, AMD64_RSP, regs_offset + (AMD64_RIP * sizeof (target_mgreg_t)), AMD64_RAX, sizeof (target_mgreg_t));

#ifdef AMD64_CALLEE_SAVED_XREGS
int fregs_offset = ctx_offset + MONO_STRUCT_OFFSET (MonoContext, fregs);
for (i = 0; i < AMD64_XMM_NREG; ++i) {
if (AMD64_IS_CALLEE_SAVED_XREG (i)) {
#if defined(MONO_HAVE_SIMD_REG)
amd64_movdqu_membase_reg (code, AMD64_RSP, fregs_offset + (i * sizeof (MonoContextSimdReg)), i);
#else
amd64_movsd_membase_reg (code, AMD64_RSP, fregs_offset + (i * sizeof (double)), i);
#endif
}
}
#endif

/* Set arg1 == ctx */
amd64_lea_membase (code, AMD64_RAX, AMD64_RSP, ctx_offset);
amd64_mov_membase_reg (code, AMD64_RSP, arg_offsets [0], AMD64_RAX, sizeof (target_mgreg_t));
Expand Down Expand Up @@ -617,6 +649,75 @@ mono_arch_get_throw_corlib_exception (MonoTrampInfo **info, gboolean aot)
}
#endif /* !DISABLE_JIT */

#if defined(AMD64_CALLEE_SAVED_XREGS) && defined(TARGET_WIN32)

/*
 * unwind_llvm_frame_win64:
 *
 *   Walk the native Windows x64 unwind codes of the function containing IP and
 * record where its prolog saved XMM registers (UWOP_SAVE_XMM128 and
 * UWOP_SAVE_XMM128_FAR).
 *
 * IP: address inside the frame being unwound.
 * REGS/NREGS: register values of that frame. RSP, or the frame register named
 *   by the unwind info, is read from here to locate the save area.
 * READONLY_REGS: when TRUE, REGS is never written.
 * SAVE_LOCATIONS/SAVE_LOCATIONS_LEN: receives the address of each saved XMM
 *   register at index AMD64_NREG + <xmm number>. Slots that are already set
 *   are left untouched.
 *
 * Returns TRUE if RtlLookupFunctionEntry () found a function entry for IP,
 * FALSE otherwise.
 */
static gboolean
unwind_llvm_frame_win64 (
	gpointer ip,
	mono_unwind_reg_t *regs,
	int nregs,
	gboolean readonly_regs,
	host_mgreg_t **save_locations,
	int save_locations_len)
{
	gboolean result = FALSE;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be better to either fix llvm to emit dwarf unwind info for these, or add this unwind info to the generic info when thats loaded.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should move back to just use windows unwind ops for llvm frames, right now we enable dwarf + native unwind codes in llvm generated code, and then dwarf is not following the windows ABI (since it misses the XMM registers) since the dwarf implementation was not meant to be used under Windows. It is probably better to add handling of additional native unwind codes for llvm frames and remove the patch that emits dwarf for llvm frames. That will also remove the duplication that we currently carry in the Windows x64 images. That change could be done as a follow up to this PR, if needed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vargaz @lateralusX could you create a GH issue for the follow up work

	DWORD64 address = (DWORD64)ip;
	DWORD64 image_base = 0;
	UNWIND_HISTORY_TABLE entry = { 0 };

	if (RtlLookupFunctionEntry (address, &image_base, &entry)) {
		for (DWORD i = 0; i < entry.Count; i++) {
			DWORD64 start = entry.Entry[i].ImageBase + entry.Entry[i].FunctionEntry->BeginAddress;
			DWORD64 end = entry.Entry[i].ImageBase + entry.Entry[i].FunctionEntry->EndAddress;
			DWORD64 start_offset = address - start;
			UNWIND_INFO *unwind_info_address = (UNWIND_INFO*)(entry.Entry[i].ImageBase + entry.Entry[i].FunctionEntry->UnwindInfoAddress);
			guint8 *fixed_stack_loc = NULL;

			/* Save offsets are relative to RSP, or to the frame register minus its scaled offset. */
			if (!unwind_info_address->FrameRegister) {
				g_assert (nregs > AMD64_RSP);
				fixed_stack_loc = (guint8 *)(regs [AMD64_RSP]);
			} else {
				g_assert (nregs > unwind_info_address->FrameRegister);
				fixed_stack_loc = (guint8 *)((guint8 *)(regs [unwind_info_address->FrameRegister]) - (unwind_info_address->FrameOffset * 16));
			}

			g_assert (fixed_stack_loc);

			if (start <= address && end >= address) {
				const guint code_count = unwind_info_address->CountOfCodes;

				/* J is wider than guchar so that skipping data slots can never wrap around. */
				for (guint j = 0; j < code_count; j++) {
					/*
					 * Decode the header slot once, before J moves over the data slots
					 * that follow it. Data slots are raw offsets and must never be
					 * interpreted as opcodes.
					 */
					const guint op = unwind_info_address->UnwindCode[j].UnwindOp;
					const guint op_info = unwind_info_address->UnwindCode[j].OpInfo;
					const gboolean executed = start_offset > unwind_info_address->UnwindCode[j].CodeOffset;

					switch (op) {
					case UWOP_ALLOC_LARGE:
						/* OpInfo 0: scaled size in 1 slot, OpInfo 1: unscaled size in 2 slots. */
						j += op_info ? 2 : 1;
						break;
					case UWOP_SAVE_NONVOL:
						j += 1;
						break;
					case UWOP_SAVE_NONVOL_FAR:
						j += 2;
						break;
					case UWOP_SAVE_XMM128:
					case UWOP_SAVE_XMM128_FAR: {
						const guint data_slots = (op == UWOP_SAVE_XMM128) ? 1 : 2;

						if (executed) {
							g_assert (j + data_slots < code_count);

							int reg = AMD64_NREG + (int)op_info;
							guint8 *offset = fixed_stack_loc;

							if (op == UWOP_SAVE_XMM128) {
								/* Short form: offset / 16 in the next slot. */
								offset += unwind_info_address->UnwindCode[j+1].FrameOffset * 16;
							} else {
								/* Far form: unscaled 32 bit offset, low word first. */
								guint32 far_offset = (guint32)unwind_info_address->UnwindCode[j+1].FrameOffset |
									((guint32)unwind_info_address->UnwindCode[j+2].FrameOffset << 16);
								offset += far_offset;
							}

							if (!readonly_regs && regs && (reg < nregs))
								regs [reg] = GUINT64_TO_HMREG (*(guint64*)(offset));

							if (save_locations && (reg < save_locations_len) && !save_locations [reg])
								save_locations [reg] = (host_mgreg_t *)offset;
						}

						j += data_slots;
						break;
					}
					default:
						/* Single slot opcode, nothing to skip. */
						break;
					}
				}
			}
		}

		result = TRUE;
	}

	return result;
}
#endif

/*
* mono_arch_unwind_frame:
*
Expand All @@ -641,7 +742,14 @@ mono_arch_unwind_frame (MonoJitTlsData *jit_tls,
*new_ctx = *ctx;

if (ji != NULL) {
host_mgreg_t regs [MONO_MAX_IREGS + 1];
#ifdef AMD64_CALLEE_SAVED_XREGS
host_mgreg_t *restored_regs [AMD64_NREG + AMD64_XMM_NREG];
#else
host_mgreg_t *restored_regs [AMD64_NREG];
#endif

const int restored_regs_len = G_N_ELEMENTS (restored_regs);

guint8 *cfa;
guint32 unwind_info_len;
guint8 *unwind_info;
Expand All @@ -665,19 +773,34 @@ mono_arch_unwind_frame (MonoJitTlsData *jit_tls,
if (ji->has_arch_eh_info)
epilog = (guint8*)ji->code_start + ji->code_size - mono_jinfo_get_epilog_size (ji);

for (i = 0; i < AMD64_NREG; ++i)
regs [i] = new_ctx->gregs [i];

gboolean success = mono_unwind_frame (unwind_info, unwind_info_len, (guint8 *)ji->code_start,
(guint8*)ji->code_start + ji->code_size,
(guint8 *)ip, epilog ? &epilog : NULL, regs, MONO_MAX_IREGS + 1,
save_locations, MONO_MAX_IREGS, &cfa);
(guint8 *)ip, epilog ? &epilog : NULL,
new_ctx->gregs, AMD64_NREG, TRUE,
restored_regs, restored_regs_len, &cfa);

#if defined(AMD64_CALLEE_SAVED_XREGS) && defined(TARGET_WIN32)
if (ji->from_llvm)
success &= unwind_llvm_frame_win64 ((guint8 *)ip, new_ctx->gregs, AMD64_NREG, TRUE, restored_regs, restored_regs_len);
#endif

if (!success)
return FALSE;

for (i = 0; i < AMD64_NREG; ++i)
new_ctx->gregs [i] = regs [i];
for (i = 0; i < AMD64_NREG; ++i) {
if (i < restored_regs_len && restored_regs [i])
new_ctx->gregs [i] = *(restored_regs [i]);
}

#ifdef AMD64_CALLEE_SAVED_XREGS
for (i = 0; i < AMD64_XMM_NREG; ++i) {
if (AMD64_NREG + i < restored_regs_len && restored_regs [AMD64_NREG + i])
memcpy (&(new_ctx->fregs [i]), restored_regs [AMD64_NREG + i], sizeof (new_ctx->fregs [0]));
}
#endif

if (save_locations)
memcpy (save_locations, restored_regs, MONO_MAX_IREGS * sizeof (restored_regs [0]));

/* The CFA becomes the new SP value */
new_ctx->gregs [AMD64_RSP] = (host_mgreg_t)(gsize)cfa;
Expand Down
2 changes: 1 addition & 1 deletion src/mono/mono/mini/exceptions-arm.c
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,7 @@ mono_arch_unwind_frame (MonoJitTlsData *jit_tls,

gboolean success = mono_unwind_frame (unwind_info, unwind_info_len, (guint8*)ji->code_start,
(guint8*)ji->code_start + ji->code_size,
(guint8*)ip, NULL, regs, MONO_MAX_IREGS + 8,
(guint8*)ip, NULL, regs, MONO_MAX_IREGS + 8, FALSE,
save_locations, MONO_MAX_IREGS, &cfa);

if (!success)
Expand Down
2 changes: 1 addition & 1 deletion src/mono/mono/mini/exceptions-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ mono_arch_unwind_frame (MonoJitTlsData *jit_tls,
gpointer ip = MINI_FTNPTR_TO_ADDR (MONO_CONTEXT_GET_IP (ctx));
gboolean success = mono_unwind_frame (unwind_info, unwind_info_len, (guint8*)ji->code_start,
(guint8*)ji->code_start + ji->code_size,
(guint8*)ip, NULL, regs, MONO_MAX_IREGS + 8,
(guint8*)ip, NULL, regs, MONO_MAX_IREGS + 8, FALSE,
save_locations, MONO_MAX_IREGS, (guint8**)&cfa);

if (!success)
Expand Down
2 changes: 1 addition & 1 deletion src/mono/mono/mini/exceptions-ppc.c
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ mono_arch_unwind_frame (MonoJitTlsData *jit_tls,

gboolean success = mono_unwind_frame (unwind_info, unwind_info_len, (guint8*)ji->code_start,
(guint8*)ji->code_start + ji->code_size,
(guint8*)ip, NULL, regs, ppc_lr + 1,
(guint8*)ip, NULL, regs, ppc_lr + 1, FALSE,
save_locations, MONO_MAX_IREGS, &cfa);

if (!success)
Expand Down
2 changes: 1 addition & 1 deletion src/mono/mono/mini/exceptions-riscv.c
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ mono_arch_unwind_frame (MonoJitTlsData *jit_tls, MonoJitInfo *ji,

gboolean success = mono_unwind_frame (unwind_info, unwind_info_len, (guint8 *)ji->code_start,
(guint8 *)ji->code_start + ji->code_size, (guint8 *)ip, NULL, regs,
MONO_MAX_IREGS + 12 + 1, save_locations, MONO_MAX_IREGS, (guint8 **)&cfa);
MONO_MAX_IREGS + 12 + 1, FALSE, save_locations, MONO_MAX_IREGS, (guint8 **)&cfa);

if (!success)
return FALSE;
Expand Down
2 changes: 1 addition & 1 deletion src/mono/mono/mini/exceptions-s390x.c
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ mono_arch_unwind_frame (MonoJitTlsData *jit_tls,
memcpy (&regs[16], &ctx->uc_mcontext.fpregs.fprs, 16 * sizeof(host_mgreg_t));
gboolean success = mono_unwind_frame (unwind_info, unwind_info_len, ji->code_start,
(guint8 *) ji->code_start + ji->code_size,
ip, epilog ? &epilog : NULL, regs, 32, save_locations,
ip, epilog ? &epilog : NULL, regs, 32, FALSE, save_locations,
MONO_MAX_IREGS, &cfa);

if (!success)
Expand Down
2 changes: 1 addition & 1 deletion src/mono/mono/mini/exceptions-x86.c
Original file line number Diff line number Diff line change
Expand Up @@ -839,7 +839,7 @@ mono_arch_unwind_frame (MonoJitTlsData *jit_tls,

gboolean success = mono_unwind_frame ((guint8*)unwind_info, unwind_info_len, (guint8*)ji->code_start,
(guint8*)ji->code_start + ji->code_size,
(guint8*)ip, NULL, regs, MONO_MAX_IREGS + 1,
(guint8*)ip, NULL, regs, MONO_MAX_IREGS + 1, FALSE,
save_locations, MONO_MAX_IREGS, &cfa);

if (!success)
Expand Down
49 changes: 49 additions & 0 deletions src/mono/mono/mini/mini-amd64.c
Original file line number Diff line number Diff line change
Expand Up @@ -1782,6 +1782,23 @@ mono_arch_allocate_vars (MonoCompile *cfg)
if (AMD64_IS_CALLEE_SAVED_REG (i) && (cfg->arch.saved_iregs & (1 << i))) {
offset += sizeof (target_mgreg_t);
}

#if defined(AMD64_CALLEE_SAVED_XREGS)
if (cfg->method->save_lmf) {
#if defined(MONO_HAVE_SIMD_REG)
int xreg_size = sizeof (MonoContextSimdReg);
#else
int xreg_size = sizeof (double);
#endif
offset = ALIGN_TO (offset, xreg_size);
for (guint i = 0; i < AMD64_XMM_NREG; ++i) {
if (AMD64_IS_CALLEE_SAVED_XREG (i)) {
offset += xreg_size;
}
}
}
#endif

if (!cfg->arch.omit_fp)
cfg->arch.reg_save_area_offset = -offset;

Expand Down Expand Up @@ -8046,6 +8063,38 @@ MONO_RESTORE_WARNING
}
}

#if defined(AMD64_CALLEE_SAVED_XREGS)
if (method->save_lmf) {
#if defined(MONO_HAVE_SIMD_REG)
int xreg_size = sizeof (MonoContextSimdReg);
#else
int xreg_size = sizeof (double);
#endif
save_area_offset = ALIGN_TO (save_area_offset, xreg_size);

for (guint16 i = 0; i < AMD64_XMM_NREG; ++i) {
if (AMD64_IS_CALLEE_SAVED_XREG (i)) {
#if defined(MONO_HAVE_SIMD_REG)
amd64_movdqu_membase_reg (code, cfg->frame_reg, save_area_offset, i);
#else
amd64_movsd_membase_reg (code, cfg->frame_reg, save_area_offset, i);
#endif
if (cfg->arch.omit_fp) {
mono_emit_unwind_op_offset (cfg, code, AMD64_NREG + i, - (cfa_offset - save_area_offset));
/* These are handled automatically by the stack marking code */
mini_gc_set_slot_type_from_cfa (cfg, - (cfa_offset - save_area_offset), SLOT_NOREF);
} else {
mono_emit_unwind_op_offset (cfg, code, AMD64_NREG + i, - (-save_area_offset + (2 * 8)));
// FIXME: GC
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Re: FIXME - is there some follow up needed for this?

Copy link
Member Author

@lateralusX lateralusX Jan 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That code is mainly a copy/paste, so I kept the original comments as well. I guess that FIXME has been in there for many, many years, so no follow-up has been planned; I'm not even sure it's relevant anymore. But if we drop it here, we should also drop it from the source it was copied from.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect it is some artifact from the times we had precise stack scanning.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If stack scanning is always conservative (is that the case?) then there is nothing needed here for GC.
Perhaps remove the FIXME from both places?

}

save_area_offset += xreg_size;
async_exc_point (code);
}
}
}
#endif

/* store runtime generic context */
if (cfg->rgctx_var) {
g_assert (cfg->rgctx_var->opcode == OP_REGOFFSET &&
Expand Down
24 changes: 13 additions & 11 deletions src/mono/mono/mini/mini-amd64.h
Original file line number Diff line number Diff line change
Expand Up @@ -514,15 +514,7 @@ mono_amd64_get_exception_trampolines (gboolean aot);
int
mono_amd64_get_tls_gs_offset (void);

#if defined(TARGET_WIN32) && !defined(DISABLE_JIT)

#define MONO_ARCH_HAVE_UNWIND_TABLE 1
#define MONO_ARCH_HAVE_CODE_CHUNK_TRACKING 1

#ifdef ENABLE_CHECKED_BUILD
#define ENABLE_CHECKED_BUILD_UNWINDINFO
#endif

#if defined(TARGET_WIN32)
#define MONO_MAX_UNWIND_CODES 22

typedef enum _UNWIND_OP_CODES {
Expand All @@ -532,7 +524,7 @@ typedef enum _UNWIND_OP_CODES {
UWOP_SET_FPREG, /* no info, FP = RSP + UNWIND_INFO.FPRegOffset*16 */
UWOP_SAVE_NONVOL, /* info == register number, offset in next slot */
UWOP_SAVE_NONVOL_FAR, /* info == register number, offset in next 2 slots */
UWOP_SAVE_XMM128, /* info == XMM reg number, offset in next slot */
UWOP_SAVE_XMM128 = 8, /* info == XMM reg number, offset in next slot */
UWOP_SAVE_XMM128_FAR, /* info == XMM reg number, offset in next 2 slots */
UWOP_PUSH_MACHFRAME /* info == 0: no error-code, 1: error-code */
} UNWIND_CODE_OPS;
Expand Down Expand Up @@ -562,6 +554,15 @@ typedef struct _UNWIND_INFO {
* OPTIONAL ULONG ExceptionData[]; */
} UNWIND_INFO, *PUNWIND_INFO;

#if !defined(DISABLE_JIT)

#define MONO_ARCH_HAVE_UNWIND_TABLE 1
#define MONO_ARCH_HAVE_CODE_CHUNK_TRACKING 1

#ifdef ENABLE_CHECKED_BUILD
#define ENABLE_CHECKED_BUILD_UNWINDINFO
#endif

static inline guint
mono_arch_unwindinfo_get_size (guchar code_count)
{
Expand Down Expand Up @@ -603,7 +604,8 @@ mono_arch_code_chunk_new (void *chunk, int size);
void
mono_arch_code_chunk_destroy (void *chunk);

#endif /* defined(TARGET_WIN32) && !defined(DISABLE_JIT) */
#endif /* !defined(DISABLE_JIT) */
#endif /* defined(TARGET_WIN32) */

#ifdef MONO_ARCH_HAVE_UNWIND_TABLE
// Allocate additional size for max 3 unwind ops (push + fp or sp small|large) + unwind info struct trailing code buffer.
Expand Down
2 changes: 1 addition & 1 deletion src/mono/mono/mini/mini-unwind.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ mono_unwind_ops_encode (GSList *unwind_ops, guint32 *out_len);
/*
 * mono_unwind_frame:
 *
 *   NOTE(review): the notes below are inferred from the call sites touched by
 * this change, confirm them against the definition.
 *
 * START_IP/END_IP/IP: callers pass the method's code start, code start + code
 *   size, and the ip of the frame being unwound.
 * MARK_LOCATIONS: amd64 and s390x pass the address of their epilog pointer or
 *   NULL, the other architectures pass NULL.
 * REGS/NREGS: register array and its element count.
 * READONLY_REGS: amd64 passes TRUE together with the context's own gregs and
 *   reads the restored values through SAVE_LOCATIONS instead. Every other
 *   architecture passes FALSE. Presumably TRUE means REGS is not written.
 * SAVE_LOCATIONS/SAVE_LOCATIONS_LEN: array of pointers to saved register
 *   slots and its element count.
 * OUT_CFA: receives the CFA. amd64 uses it as the new stack pointer.
 *
 * Callers treat a FALSE return as failure to unwind.
 */
gboolean
mono_unwind_frame (guint8 *unwind_info, guint32 unwind_info_len,
guint8 *start_ip, guint8 *end_ip, guint8 *ip, guint8 **mark_locations,
mono_unwind_reg_t *regs, int nregs,
mono_unwind_reg_t *regs, int nregs, gboolean readonly_regs,
host_mgreg_t **save_locations, int save_locations_len,
guint8 **out_cfa);

Expand Down
Loading