Skip to content

Commit

Permalink
Wasm jiterpreter cleanup and bug fixes pt. 4 (#79324)
Browse files Browse the repository at this point in the history
* Instead of jitting interp_entry wrappers for every entry point, record a hit count for each one and then queue it for JIT compilation once it's hit enough times
* Add prefs controlling interp entry queue thresholds
* Add support for using imported globals for pointer constants (to enable future threading/caching), off by default
* Use base 36 for jiterpreter import names to save space
  • Loading branch information
kg authored Dec 20, 2022
1 parent 82e9d38 commit 429cda7
Show file tree
Hide file tree
Showing 9 changed files with 207 additions and 71 deletions.
38 changes: 29 additions & 9 deletions src/mono/mono/mini/interp/interp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2884,45 +2884,65 @@ init_arglist (InterpFrame *frame, MonoMethodSignature *sig, stackval *sp, char *
* this/static * ret/void * 16 arguments -> 64 functions.
*/

#if HOST_BROWSER
/*
* For the jiterpreter, we want to record a hit count for interp_entry wrappers that can
* be jitted, but not for ones that can't. As a result we need to put this in its own
* macro instead of in INTERP_ENTRY_BASE, so that the generic wrappers don't have to
* call it on every invocation.
* Once this gets called a few hundred times, the wrapper will be jitted so we'll stop
* paying the cost of the hit counter and the entry will become faster.
*/
#define INTERP_ENTRY_UPDATE_HIT_COUNT(_method) \
if (mono_opt_jiterpreter_interp_entry_enabled) \
mono_interp_record_interp_entry (_method)
#else
#define INTERP_ENTRY_UPDATE_HIT_COUNT(_method)
#endif

#define INTERP_ENTRY_BASE(_method, _this_arg, _res) \
InterpEntryData data; \
(data).rmethod = (_method); \
(data).res = (_res); \
(data).this_arg = (_this_arg); \
(data).many_args = NULL;

#define INTERP_ENTRY_BASE_WITH_HIT_COUNT(_method, _this_arg, _res) \
INTERP_ENTRY_BASE (_method, _this_arg, _res) \
INTERP_ENTRY_UPDATE_HIT_COUNT (_method);

#define INTERP_ENTRY0(_this_arg, _res, _method) { \
INTERP_ENTRY_BASE (_method, _this_arg, _res); \
INTERP_ENTRY_BASE_WITH_HIT_COUNT (_method, _this_arg, _res); \
interp_entry (&data); \
}
#define INTERP_ENTRY1(_this_arg, _res, _method) { \
INTERP_ENTRY_BASE (_method, _this_arg, _res); \
INTERP_ENTRY_BASE_WITH_HIT_COUNT (_method, _this_arg, _res); \
(data).args [0] = arg1; \
interp_entry (&data); \
}
#define INTERP_ENTRY2(_this_arg, _res, _method) { \
INTERP_ENTRY_BASE (_method, _this_arg, _res); \
INTERP_ENTRY_BASE_WITH_HIT_COUNT (_method, _this_arg, _res); \
(data).args [0] = arg1; \
(data).args [1] = arg2; \
interp_entry (&data); \
}
#define INTERP_ENTRY3(_this_arg, _res, _method) { \
INTERP_ENTRY_BASE (_method, _this_arg, _res); \
INTERP_ENTRY_BASE_WITH_HIT_COUNT (_method, _this_arg, _res); \
(data).args [0] = arg1; \
(data).args [1] = arg2; \
(data).args [2] = arg3; \
interp_entry (&data); \
}
#define INTERP_ENTRY4(_this_arg, _res, _method) { \
INTERP_ENTRY_BASE (_method, _this_arg, _res); \
INTERP_ENTRY_BASE_WITH_HIT_COUNT (_method, _this_arg, _res); \
(data).args [0] = arg1; \
(data).args [1] = arg2; \
(data).args [2] = arg3; \
(data).args [3] = arg4; \
interp_entry (&data); \
}
#define INTERP_ENTRY5(_this_arg, _res, _method) { \
INTERP_ENTRY_BASE (_method, _this_arg, _res); \
INTERP_ENTRY_BASE_WITH_HIT_COUNT (_method, _this_arg, _res); \
(data).args [0] = arg1; \
(data).args [1] = arg2; \
(data).args [2] = arg3; \
Expand All @@ -2931,7 +2951,7 @@ init_arglist (InterpFrame *frame, MonoMethodSignature *sig, stackval *sp, char *
interp_entry (&data); \
}
#define INTERP_ENTRY6(_this_arg, _res, _method) { \
INTERP_ENTRY_BASE (_method, _this_arg, _res); \
INTERP_ENTRY_BASE_WITH_HIT_COUNT (_method, _this_arg, _res); \
(data).args [0] = arg1; \
(data).args [1] = arg2; \
(data).args [2] = arg3; \
Expand All @@ -2941,7 +2961,7 @@ init_arglist (InterpFrame *frame, MonoMethodSignature *sig, stackval *sp, char *
interp_entry (&data); \
}
#define INTERP_ENTRY7(_this_arg, _res, _method) { \
INTERP_ENTRY_BASE (_method, _this_arg, _res); \
INTERP_ENTRY_BASE_WITH_HIT_COUNT (_method, _this_arg, _res); \
(data).args [0] = arg1; \
(data).args [1] = arg2; \
(data).args [2] = arg3; \
Expand All @@ -2952,7 +2972,7 @@ init_arglist (InterpFrame *frame, MonoMethodSignature *sig, stackval *sp, char *
interp_entry (&data); \
}
#define INTERP_ENTRY8(_this_arg, _res, _method) { \
INTERP_ENTRY_BASE (_method, _this_arg, _res); \
INTERP_ENTRY_BASE_WITH_HIT_COUNT (_method, _this_arg, _res); \
(data).args [0] = arg1; \
(data).args [1] = arg2; \
(data).args [2] = arg3; \
Expand Down
3 changes: 3 additions & 0 deletions src/mono/mono/mini/interp/jiterpreter.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ jiterp_insert_entry_points (void *td);
void
mono_jiterp_register_jit_call_thunk (void *cinfo, WasmJitCallThunk thunk);

extern void
mono_interp_record_interp_entry (void *fn_ptr);

// jiterpreter-interp-entry.ts
// HACK: Pass void* so that this header can include safely in files without definition for InterpMethod
extern gpointer
Expand Down
13 changes: 10 additions & 3 deletions src/mono/mono/utils/options-def.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,21 @@ DEFINE_BOOL(jiterpreter_estimate_heat, "jiterpreter-estimate-heat", FALSE, "Main
DEFINE_BOOL(jiterpreter_count_bailouts, "jiterpreter-count-bailouts", FALSE, "Maintain accurate count of all trace bailouts based on cause")
// Dump the wasm blob for all compiled traces
DEFINE_BOOL(jiterpreter_dump_traces, "jiterpreter-dump-traces", FALSE, "Dump the wasm blob for all compiled traces to the console")
// Use runtime imports for pointer constants
// Currently reduces performance significantly :(
DEFINE_BOOL(jiterpreter_use_constants, "jiterpreter-use-constants", FALSE, "Use runtime imports for pointer constants")
// any trace that doesn't have at least this many meaningful (non-nop) opcodes in it will be rejected
DEFINE_INT(jiterpreter_minimum_trace_length, "jiterpreter-minimum-trace-length", 8, "Reject traces shorter than this number of meaningful opcodes")
// once a trace entry point is inserted, we only actually JIT code for it once it's been hit this many times
DEFINE_INT(jiterpreter_minimum_trace_hit_count, "jiterpreter-minimum-trace-hit-count", 10000, "JIT trace entry points once they are hit this many times")
DEFINE_INT(jiterpreter_minimum_trace_hit_count, "jiterpreter-minimum-trace-hit-count", 5000, "JIT trace entry points once they are hit this many times")
// After a do_jit_call call site is hit this many times, we will queue it to be jitted
DEFINE_INT(jiterpreter_jit_call_trampoline_hit_count, "jiterpreter-jit-call-hit-count", 3000, "Queue specialized do_jit_call trampoline for JIT after this many hits")
DEFINE_INT(jiterpreter_jit_call_trampoline_hit_count, "jiterpreter-jit-call-hit-count", 2000, "Queue specialized do_jit_call trampoline for JIT after this many hits")
// After a do_jit_call call site is hit this many times without being jitted, we will flush the JIT queue
DEFINE_INT(jiterpreter_jit_call_queue_flush_threshold, "jiterpreter-jit-call-queue-flush-threshold", 10000, "Flush the do_jit_call JIT queue after an unJITted call site has this many hits")
DEFINE_INT(jiterpreter_jit_call_queue_flush_threshold, "jiterpreter-jit-call-queue-flush-threshold", 8000, "Flush the do_jit_call JIT queue after an unJITted call site has this many hits")
// After a generic interp_entry wrapper is hit this many times, we will queue it to be jitted
DEFINE_INT(jiterpreter_interp_entry_trampoline_hit_count, "jiterpreter-interp-entry-hit-count", 250, "Queue specialized interp_entry wrapper for JIT after this many hits")
// After a generic interp_entry wrapper is hit this many times without being jitted, we will flush the JIT queue
DEFINE_INT(jiterpreter_interp_entry_queue_flush_threshold, "jiterpreter-interp-entry-queue-flush-threshold", 1000, "Flush the interp_entry JIT queue after an unJITted call site has this many hits")
#endif // HOST_BROWSER

/* Cleanup */
Expand Down
1 change: 1 addition & 0 deletions src/mono/wasm/runtime/es6/dotnet.es6.lib.js
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ const linked_functions = [

// jiterpreter.c / interp.c / transform.c
"mono_interp_tier_prepare_jiterpreter",
"mono_interp_record_interp_entry",
"mono_interp_jit_wasm_entry_trampoline",
"mono_interp_jit_wasm_jit_call_trampoline",
"mono_interp_invoke_wasm_jit_call_trampoline",
Expand Down
3 changes: 2 additions & 1 deletion src/mono/wasm/runtime/exports-linker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import { mono_wasm_load_icu_data, mono_wasm_get_icudt_name } from "./icu";
import { mono_wasm_bind_cs_function } from "./invoke-cs";
import { mono_wasm_bind_js_function, mono_wasm_invoke_bound_function, mono_wasm_invoke_import } from "./invoke-js";
import { mono_interp_tier_prepare_jiterpreter } from "./jiterpreter";
import { mono_interp_jit_wasm_entry_trampoline } from "./jiterpreter-interp-entry";
import { mono_interp_jit_wasm_entry_trampoline, mono_interp_record_interp_entry } from "./jiterpreter-interp-entry";
import { mono_interp_jit_wasm_jit_call_trampoline, mono_interp_invoke_wasm_jit_call_trampoline, mono_interp_flush_jitcall_queue, mono_jiterp_do_jit_call_indirect } from "./jiterpreter-jit-call";
import { mono_wasm_typed_array_from_ref } from "./net6-legacy/buffers";
import {
Expand Down Expand Up @@ -57,6 +57,7 @@ export function export_linker(): any {

// interp.c and jiterpreter.c
mono_interp_tier_prepare_jiterpreter,
mono_interp_record_interp_entry,
mono_interp_jit_wasm_entry_trampoline,
mono_interp_jit_wasm_jit_call_trampoline,
mono_interp_invoke_wasm_jit_call_trampoline,
Expand Down
75 changes: 57 additions & 18 deletions src/mono/wasm/runtime/jiterpreter-interp-entry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ import cwraps from "./cwraps";
import {
WasmValtype, WasmBuilder, addWasmFunctionPointer,
_now, elapsedTimes, counters, getRawCwrap, importDef,
getWasmFunctionTable, recordFailure
getWasmFunctionTable, recordFailure, getOptions,
JiterpreterOptions, shortNameBase
} from "./jiterpreter-support";

// Controls miscellaneous diagnostic output.
Expand Down Expand Up @@ -45,6 +46,7 @@ let trampImports : Array<[string, string, Function]> | undefined;
let fnTable : WebAssembly.Table;
let jitQueueTimeout = 0;
const jitQueue : TrampolineInfo[] = [];
const infoTable : { [ptr: number] : TrampolineInfo } = {};

/*
const enum WasmReftype {
Expand Down Expand Up @@ -81,6 +83,7 @@ class TrampolineInfo {

defaultImplementation: number;
result: number;
hitCount: number;

constructor (
imethod: number, method: MonoMethod, argumentCount: number, pParamTypes: NativePointer,
Expand Down Expand Up @@ -112,9 +115,37 @@ class TrampolineInfo {
subName = `${this.imethod.toString(16)}_${subName}`;
}
this.traceName = subName;
this.hitCount = 0;
}
}

let mostRecentOptions : JiterpreterOptions | undefined = undefined;

export function mono_interp_record_interp_entry (imethod: number) {
// clear the unbox bit
imethod = imethod & ~0x1;

const info = infoTable[imethod];
// This shouldn't happen but it's not worth crashing over
if (!info)
return;

if (!mostRecentOptions)
mostRecentOptions = getOptions();

info.hitCount++;
if (info.hitCount === mostRecentOptions!.interpEntryFlushThreshold)
flush_wasm_entry_trampoline_jit_queue();
else if (info.hitCount !== mostRecentOptions!.interpEntryHitCount)
return;

jitQueue.push(info);
if (jitQueue.length >= maxJitQueueLength)
flush_wasm_entry_trampoline_jit_queue();
else
ensure_jit_is_scheduled();
}

// returns function pointer
export function mono_interp_jit_wasm_entry_trampoline (
imethod: number, method: MonoMethod, argumentCount: number, pParamTypes: NativePointer,
Expand All @@ -133,19 +164,15 @@ export function mono_interp_jit_wasm_entry_trampoline (
if (!fnTable)
fnTable = getWasmFunctionTable();

jitQueue.push(info);

// We start by creating a function pointer for this interp_entry trampoline, but instead of
// compiling it right away, we make it point to the default implementation for that signature
// This gives us time to wait before jitting it so we can jit multiple trampolines at once.
// Some entry wrappers are also only called a few dozen times, so it's valuable to wait
// until a wrapper is called a lot before wasting time/memory jitting it.
const defaultImplementationFn = fnTable.get(defaultImplementation);
info.result = addWasmFunctionPointer(defaultImplementationFn);

if (jitQueue.length >= maxJitQueueLength)
flush_wasm_entry_trampoline_jit_queue();
else
ensure_jit_is_scheduled();

infoTable[imethod] = info;
return info.result;
}

Expand All @@ -172,11 +199,14 @@ function flush_wasm_entry_trampoline_jit_queue () {
if (jitQueue.length <= 0)
return;

// If the function signature contains types that need stackval_from_data, that'll use
// some constant slots, so make some extra space
const constantSlots = (4 * jitQueue.length) + 1;
let builder = trampBuilder;
if (!builder)
trampBuilder = builder = new WasmBuilder();
trampBuilder = builder = new WasmBuilder(constantSlots);
else
builder.clear();
builder.clear(constantSlots);
const started = _now();
let compileStarted = 0;
let rejected = true, threw = false;
Expand Down Expand Up @@ -239,7 +269,7 @@ function flush_wasm_entry_trampoline_jit_queue () {
// Emit function imports
for (let i = 0; i < trampImports.length; i++) {
mono_assert(trampImports[i], () => `trace #${i} missing`);
const wasmName = compress ? i.toString(16) : undefined;
const wasmName = compress ? i.toString(shortNameBase) : undefined;
builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], wasmName);
}

Expand Down Expand Up @@ -275,6 +305,7 @@ function flush_wasm_entry_trampoline_jit_queue () {
builder.beginFunction(info.traceName, {
"sp_args": WasmValtype.i32,
"need_unbox": WasmValtype.i32,
"scratchBuffer": WasmValtype.i32,
});

const ok = generate_wasm_body(builder, info);
Expand All @@ -293,16 +324,17 @@ function flush_wasm_entry_trampoline_jit_queue () {
const traceModule = new WebAssembly.Module(buffer);

const imports : any = {
h: (<any>Module).asm.memory
};
// Place our function imports into the import dictionary
for (let i = 0; i < trampImports.length; i++) {
const wasmName = compress ? i.toString(16) : trampImports[i][0];
const wasmName = compress ? i.toString(36) : trampImports[i][0];
imports[wasmName] = trampImports[i][2];
}

const traceInstance = new WebAssembly.Instance(traceModule, {
i: imports
i: imports,
c: <any>builder.getConstants(),
m: { h: (<any>Module).asm.memory },
});

// Now that we've jitted the trampolines, go through and fix up the function pointers
Expand Down Expand Up @@ -432,7 +464,7 @@ function append_stackval_from_data (

default: {
// Call stackval_from_data to copy the value and get its size
builder.i32_const(<any>type);
builder.ptr_const(type);
// result
builder.local("sp_args");
// value
Expand All @@ -454,6 +486,12 @@ function generate_wasm_body (
) : boolean {
// FIXME: This is not thread-safe, but the alternative of alloca makes the trampoline
// more expensive
// The solution is likely to put the address of the scratch buffer in a global that we provide
// at module instantiation time, so each thread can malloc its own copy of the buffer
// and then pass it in when instantiating instead of compiling the constant into the module
// FIXME: Pre-allocate these buffers and their constant slots at the start before we
// generate function bodies, so that even if we run out of constant slots for MonoType we
// will always have put the buffers in a constant slot. This will be necessary for thread safety
const scratchBuffer = <any>Module._malloc(sizeOfJiterpEntryData);
_zero_region(scratchBuffer, sizeOfJiterpEntryData);

Expand All @@ -479,7 +517,8 @@ function generate_wasm_body (
}

// Populate the scratch buffer containing call data
builder.i32_const(scratchBuffer);
builder.ptr_const(scratchBuffer);
builder.local("scratchBuffer", WasmOpcode.tee_local);

builder.local("rmethod");
// Clear the unbox-this-reference flag if present (see above) so that rmethod is a valid ptr
Expand All @@ -492,7 +531,7 @@ function generate_wasm_body (

// prologue takes data->rmethod and initializes data->context, then returns a value for sp_args
// prologue also performs thread attach
builder.i32_const(scratchBuffer);
builder.local("scratchBuffer");
// prologue takes this_arg so it can handle delegates
if (info.hasThisReference)
builder.local("this_arg");
Expand Down Expand Up @@ -530,7 +569,7 @@ function generate_wasm_body (
append_stackval_from_data(builder, type, `arg${i}`);
}

builder.i32_const(scratchBuffer);
builder.local("scratchBuffer");
builder.local("sp_args");
if (info.hasReturnValue)
builder.local("res");
Expand Down
Loading

0 comments on commit 429cda7

Please sign in to comment.