Skip to content

Commit

Permalink
Use v128 opcodes when unrolling memsets/memmoves
Browse files Browse the repository at this point in the history
Introduce builder v128_const method
v8 doesn't optimize splats so use the enormous encoding for v128 zero
Fix fast memset for nonzero values
Detect constant shuffle vectors for i2/i4 shuffles and expand them to byte shuffle vectors at JIT time
Also optimize i1 shuffles
  • Loading branch information
kg committed Jun 1, 2023
1 parent 3eb4407 commit 07e8544
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 38 deletions.
3 changes: 2 additions & 1 deletion src/mono/mono/mini/interp/interp-simd-intrins.def
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_AND_NOT, interp_v128_and_
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL, interp_v128_u2_less_than_equal, 52)

// wasm only has a swizzle opcode for i8x16, none of the others
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle, 14)
// jiterp has special handling for i1 shuffles to secure a v8 optimization
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle, 0)
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE, interp_v128_i2_shuffle, 0)
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE, interp_v128_i4_shuffle, 0)
INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE, interp_v128_i8_shuffle, 0)
Expand Down
123 changes: 86 additions & 37 deletions src/mono/wasm/runtime/jiterpreter-trace-generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,10 @@ function is_backward_branch_target(
return false;
}

const knownConstantValues = new Map<number, number>();
type KnownConstantValue = number | Uint8Array;
const knownConstantValues = new Map<number, KnownConstantValue>();

function get_known_constant_value(builder: WasmBuilder, localOffset: number): number | undefined {
function get_known_constant_value(builder: WasmBuilder, localOffset: number): KnownConstantValue | undefined {
if (isAddressTaken(builder, localOffset))
return undefined;

Expand Down Expand Up @@ -1370,8 +1371,11 @@ export function generateWasmBody(
) {
if (!emit_simd(builder, ip, opcode, opname, simdIntrinsArgCount, simdIntrinsIndex))
ip = abort;
else
else {
containsSimd = true;
// We need to do dreg invalidation differently for simd, especially to handle ldc
skipDregInvalidation = true;
}
} else if (opcodeValue === 0) {
// This means it was explicitly marked as no-value in the opcode value table
// so we can just skip over it. This is done for things like nops.
Expand Down Expand Up @@ -1517,6 +1521,9 @@ function append_stloc_tail(builder: WasmBuilder, offset: number, opcodeOrPrefix:
const alignment = (simdOpcode !== undefined) || (opcodeOrPrefix > WasmOpcode.f64_store) ? 0 : 2;
builder.appendMemarg(offset, alignment);
invalidate_local(offset);
// HACK: Invalidate the second stack slot used by a simd vector
if (simdOpcode !== undefined)
invalidate_local(offset + 8);
}

// Pass bytesInvalidated=0 if you are reading from the local and the address will never be
Expand Down Expand Up @@ -3098,10 +3105,10 @@ function emit_simd(
case MintOpcode.MINT_SIMD_V128_LDC: {
if (builder.options.enableSimd && getIsWasmSimdSupported()) {
builder.local("pLocals");
builder.v128_const(
localHeapViewU8().slice(<any>ip + 4, <any>ip + 4 + sizeOfV128)
);
const view = localHeapViewU8().slice(<any>ip + 4, <any>ip + 4 + sizeOfV128);
builder.v128_const(view);
append_simd_store(builder, ip);
knownConstantValues.set(getArgU16(ip, 1), view);
} else {
// dest
append_ldloca(builder, getArgU16(ip, 1), sizeOfV128);
Expand Down Expand Up @@ -3281,11 +3288,34 @@ function emit_simd_3(builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrins
builder.appendU8(WasmOpcode.i32_eqz);
append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store);
return true;
case SimdIntrinsic3.V128_I1_SHUFFLE: {
// Detect a constant indices vector and turn it into a const. This allows
// v8 to use a more optimized implementation of the swizzle opcode
const indicesOffset = getArgU16(ip, 3),
constantIndices = get_known_constant_value(builder, indicesOffset);

// Pre-load destination ptr
builder.local("pLocals");
// Load vec
append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);

if (typeof (constantIndices) === "object") {
// HACK: Use the known constant vector directly instead of loading it from memory.
builder.appendSimd(WasmSimdOpcode.v128_const);
builder.appendBytes(constantIndices);
} else {
// Load the indices from memory
append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
}

// we now have two vectors on the stack, the values and the byte indices
builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
append_simd_store(builder, ip);
return true;
}
case SimdIntrinsic3.V128_I2_SHUFFLE:
case SimdIntrinsic3.V128_I4_SHUFFLE:
// FIXME: I8
// FIXME: Many uses of these shuffles have constant shuffle indices,
// which we could convert into bytes at compile time for vastly improved performance
return emit_shuffle(builder, ip, index === SimdIntrinsic3.V128_I2_SHUFFLE ? 8 : 4);
default:
return false;
Expand All @@ -3297,41 +3327,60 @@ function emit_simd_3(builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrins
// implement i16 and i32 shuffles on top of wasm's only shuffle opcode by expanding the
// element shuffle indices into byte indices
function emit_shuffle(builder: WasmBuilder, ip: MintOpcodePtr, elementCount: number): boolean {
const elementSize = 16 / elementCount;
const elementSize = 16 / elementCount,
indicesOffset = getArgU16(ip, 3),
constantIndices = get_known_constant_value(builder, indicesOffset);
mono_assert((elementSize === 2) || (elementSize === 4), "Unsupported shuffle element size");

// Pre-load destination ptr
builder.local("pLocals");
// Load vec
append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
// Load indices (in chars)
append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
// There's no direct narrowing opcode for i32 -> i8, so we have to do two steps :(
if (elementCount === 4) {
// i32{lane0 ... lane3} -> i16{lane0 ... lane3, 0 ...}
if (typeof (constantIndices) === "object") {
// HACK: We have a known constant shuffle vector with char or int indices. Expand it to
// byte indices and then embed a new constant in the trace.
const newShuffleVector = new Uint8Array(sizeOfV128),
nativeIndices = (elementSize === 2)
? new Uint16Array(constantIndices.buffer, constantIndices.byteOffset, elementCount)
: new Uint32Array(constantIndices.buffer, constantIndices.byteOffset, elementCount);
for (let i = 0, k = 0; i < elementCount; i++, k += elementSize) {
const elementIndex = nativeIndices[i];
for (let j = 0; j < elementSize; j++)
newShuffleVector[k + j] = (elementIndex * elementSize) + j;
}
// console.log(`shuffle w/element size ${elementSize} with constant indices ${nativeIndices} (${constantIndices}) -> byte indices ${newShuffleVector}`);
builder.appendSimd(WasmSimdOpcode.v128_const);
builder.appendBytes(newShuffleVector);
} else {
// Load indices (in chars)
append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
// There's no direct narrowing opcode for i32 -> i8, so we have to do two steps :(
if (elementCount === 4) {
// i32{lane0 ... lane3} -> i16{lane0 ... lane3, 0 ...}
builder.v128_const(0);
builder.appendSimd(WasmSimdOpcode.i16x8_narrow_i32x4_u);
}
// Load a zero vector (narrow takes two vectors)
builder.v128_const(0);
builder.appendSimd(WasmSimdOpcode.i16x8_narrow_i32x4_u);
}
// Load a zero vector (narrow takes two vectors)
builder.v128_const(0);
// i16{lane0 ... lane7} -> i8{lane0 ... lane7, 0 ...}
builder.appendSimd(WasmSimdOpcode.i8x16_narrow_i16x8_u);
// i8{0, 1, 2, 3 ...} -> i8{0, 0, 1, 1, 2, 2, 3, 3 ...}
builder.appendSimd(WasmSimdOpcode.v128_const);
for (let i = 0; i < elementCount; i++) {
for (let j = 0; j < elementSize; j++)
builder.appendU8(i);
}
builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
// multiply indices by 2 to scale from char indices to byte indices
builder.i32_const(elementCount === 4 ? 2 : 1);
builder.appendSimd(WasmSimdOpcode.i8x16_shl);
// now add 1 to the secondary lane of each char
builder.appendSimd(WasmSimdOpcode.v128_const);
for (let i = 0; i < elementCount; i++) {
for (let j = 0; j < elementSize; j++)
builder.appendU8(j);
// i16{lane0 ... lane7} -> i8{lane0 ... lane7, 0 ...}
builder.appendSimd(WasmSimdOpcode.i8x16_narrow_i16x8_u);
// i8{0, 1, 2, 3 ...} -> i8{0, 0, 1, 1, 2, 2, 3, 3 ...}
builder.appendSimd(WasmSimdOpcode.v128_const);
for (let i = 0; i < elementCount; i++) {
for (let j = 0; j < elementSize; j++)
builder.appendU8(i);
}
builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
// shift the indices left to scale from element indices to byte indices (x2 for chars, x4 for ints)
builder.i32_const(elementCount === 4 ? 2 : 1);
builder.appendSimd(WasmSimdOpcode.i8x16_shl);
// now add each byte's offset within its element (0, 1 for chars; 0, 1, 2, 3 for ints)
builder.appendSimd(WasmSimdOpcode.v128_const);
for (let i = 0; i < elementCount; i++) {
for (let j = 0; j < elementSize; j++)
builder.appendU8(j);
}
}
// we can do a bitwise or since the shift above left the low bit(s) of every lane clear
builder.appendSimd(WasmSimdOpcode.v128_or);
// we now have two vectors on the stack, the values and the byte indices
builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
append_simd_store(builder, ip);
Expand Down

0 comments on commit 07e8544

Please sign in to comment.