Use v128 opcodes when unrolling memsets/memmoves

Introduce builder v128_const method. V8 doesn't optimize splats, so use the enormous encoding for v128 zero. Fix fast memset for nonzero values. Detect constant shuffle vectors for i2/i4 shuffles and expand them to byte shuffle vectors at JIT time. Also optimize i1 shuffles.
dotnet · Jun 1, 2023 · 07e8544 · 07e8544
1 parent 3eb4407
commit 07e8544
Show file tree

Hide file tree

Showing 2 changed files with 88 additions and 38 deletions.
diff --git a/src/mono/mono/mini/interp/interp-simd-intrins.def b/src/mono/mono/mini/interp/interp-simd-intrins.def
@@ -115,7 +115,8 @@ INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_AND_NOT, interp_v128_and_
 INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL, interp_v128_u2_less_than_equal, 52)
 
 // wasm only has a swizzle opcode for i8x16, none of the others
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle, 14)
+// jiterp has special handling for i1 shuffles to secure a v8 optimization
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle, 0)
 INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE, interp_v128_i2_shuffle, 0)
 INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE, interp_v128_i4_shuffle, 0)
 INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE, interp_v128_i8_shuffle, 0)

diff --git a/src/mono/wasm/runtime/jiterpreter-trace-generator.ts b/src/mono/wasm/runtime/jiterpreter-trace-generator.ts
@@ -152,9 +152,10 @@ function is_backward_branch_target(
     return false;
 }
 
-const knownConstantValues = new Map<number, number>();
+type KnownConstantValue = number | Uint8Array;
+const knownConstantValues = new Map<number, KnownConstantValue>();
 
-function get_known_constant_value(builder: WasmBuilder, localOffset: number): number | undefined {
+function get_known_constant_value(builder: WasmBuilder, localOffset: number): KnownConstantValue | undefined {
     if (isAddressTaken(builder, localOffset))
         return undefined;
 
@@ -1370,8 +1371,11 @@ export function generateWasmBody(
                 ) {
                     if (!emit_simd(builder, ip, opcode, opname, simdIntrinsArgCount, simdIntrinsIndex))
                         ip = abort;
-                    else
+                    else {
                         containsSimd = true;
+                        // We need to do dreg invalidation differently for simd, especially to handle ldc
+                        skipDregInvalidation = true;
+                    }
                 } else if (opcodeValue === 0) {
                     // This means it was explicitly marked as no-value in the opcode value table
                     //  so we can just skip over it. This is done for things like nops.
@@ -1517,6 +1521,9 @@ function append_stloc_tail(builder: WasmBuilder, offset: number, opcodeOrPrefix:
     const alignment = (simdOpcode !== undefined) || (opcodeOrPrefix > WasmOpcode.f64_store) ? 0 : 2;
     builder.appendMemarg(offset, alignment);
     invalidate_local(offset);
+    // HACK: Invalidate the second stack slot used by a simd vector
+    if (simdOpcode !== undefined)
+        invalidate_local(offset + 8);
 }
 
 // Pass bytesInvalidated=0 if you are reading from the local and the address will never be
@@ -3098,10 +3105,10 @@ function emit_simd(
         case MintOpcode.MINT_SIMD_V128_LDC: {
             if (builder.options.enableSimd && getIsWasmSimdSupported()) {
                 builder.local("pLocals");
-                builder.v128_const(
-                    localHeapViewU8().slice(<any>ip + 4, <any>ip + 4 + sizeOfV128)
-                );
+                const view = localHeapViewU8().slice(<any>ip + 4, <any>ip + 4 + sizeOfV128);
+                builder.v128_const(view);
                 append_simd_store(builder, ip);
+                knownConstantValues.set(getArgU16(ip, 1), view);
             } else {
                 // dest
                 append_ldloca(builder, getArgU16(ip, 1), sizeOfV128);
@@ -3281,11 +3288,34 @@ function emit_simd_3(builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrins
                 builder.appendU8(WasmOpcode.i32_eqz);
             append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store);
             return true;
+        case SimdIntrinsic3.V128_I1_SHUFFLE: {
+            // Detect a constant indices vector and turn it into a const. This allows
+            //  v8 to use a more optimized implementation of the swizzle opcode
+            const indicesOffset = getArgU16(ip, 3),
+                constantIndices = get_known_constant_value(builder, indicesOffset);
+
+            // Pre-load destination ptr
+            builder.local("pLocals");
+            // Load vec
+            append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+
+            if (typeof (constantIndices) === "object") {
+                // HACK: Use the known constant vector directly instead of loading it from memory.
+                builder.appendSimd(WasmSimdOpcode.v128_const);
+                builder.appendBytes(constantIndices);
+            } else {
+                // Load the indices from memory
+                append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+            }
+
+            // we now have two vectors on the stack, the values and the byte indices
+            builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
+            append_simd_store(builder, ip);
+            return true;
+        }
         case SimdIntrinsic3.V128_I2_SHUFFLE:
         case SimdIntrinsic3.V128_I4_SHUFFLE:
             // FIXME: I8
-            // FIXME: Many uses of these shuffles have constant shuffle indices,
-            //  which we could convert into bytes at compile time for vastly improved performance
             return emit_shuffle(builder, ip, index === SimdIntrinsic3.V128_I2_SHUFFLE ? 8 : 4);
         default:
             return false;
@@ -3297,41 +3327,60 @@ function emit_simd_3(builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrins
 // implement i16 and i32 shuffles on top of wasm's only shuffle opcode by expanding the
 //  element shuffle indices into byte indices
 function emit_shuffle(builder: WasmBuilder, ip: MintOpcodePtr, elementCount: number): boolean {
-    const elementSize = 16 / elementCount;
+    const elementSize = 16 / elementCount,
+        indicesOffset = getArgU16(ip, 3),
+        constantIndices = get_known_constant_value(builder, indicesOffset);
     mono_assert((elementSize === 2) || (elementSize === 4), "Unsupported shuffle element size");
+
+    // Pre-load destination ptr
     builder.local("pLocals");
     // Load vec
     append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
-    // Load indices (in chars)
-    append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
-    // There's no direct narrowing opcode for i32 -> i8, so we have to do two steps :(
-    if (elementCount === 4) {
-        // i32{lane0 ... lane3} -> i16{lane0 ... lane3, 0 ...}
+    if (typeof (constantIndices) === "object") {
+        // HACK: We have a known constant shuffle vector with char or int indices. Expand it to
+        //  byte indices and then embed a new constant in the trace.
+        const newShuffleVector = new Uint8Array(sizeOfV128),
+            nativeIndices = (elementSize === 2)
+                ? new Uint16Array(constantIndices.buffer, constantIndices.byteOffset, elementCount)
+                : new Uint32Array(constantIndices.buffer, constantIndices.byteOffset, elementCount);
+        for (let i = 0, k = 0; i < elementCount; i++, k += elementSize) {
+            const elementIndex = nativeIndices[i];
+            for (let j = 0; j < elementSize; j++)
+                newShuffleVector[k + j] = (elementIndex * elementSize) + j;
+        }
+        // console.log(`shuffle w/element size ${elementSize} with constant indices ${nativeIndices} (${constantIndices}) -> byte indices ${newShuffleVector}`);
+        builder.appendSimd(WasmSimdOpcode.v128_const);
+        builder.appendBytes(newShuffleVector);
+    } else {
+        // Load indices (in chars)
+        append_ldloc(builder, indicesOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+        // There's no direct narrowing opcode for i32 -> i8, so we have to do two steps :(
+        if (elementCount === 4) {
+            // i32{lane0 ... lane3} -> i16{lane0 ... lane3, 0 ...}
+            builder.v128_const(0);
+            builder.appendSimd(WasmSimdOpcode.i16x8_narrow_i32x4_u);
+        }
+        // Load a zero vector (narrow takes two vectors)
         builder.v128_const(0);
-        builder.appendSimd(WasmSimdOpcode.i16x8_narrow_i32x4_u);
-    }
-    // Load a zero vector (narrow takes two vectors)
-    builder.v128_const(0);
-    // i16{lane0 ... lane7} -> i8{lane0 ... lane7, 0 ...}
-    builder.appendSimd(WasmSimdOpcode.i8x16_narrow_i16x8_u);
-    // i8{0, 1, 2, 3 ...} -> i8{0, 0, 1, 1, 2, 2, 3, 3 ...}
-    builder.appendSimd(WasmSimdOpcode.v128_const);
-    for (let i = 0; i < elementCount; i++) {
-        for (let j = 0; j < elementSize; j++)
-            builder.appendU8(i);
-    }
-    builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
-    // multiply indices by 2 to scale from char indices to byte indices
-    builder.i32_const(elementCount === 4 ? 2 : 1);
-    builder.appendSimd(WasmSimdOpcode.i8x16_shl);
-    // now add 1 to the secondary lane of each char
-    builder.appendSimd(WasmSimdOpcode.v128_const);
-    for (let i = 0; i < elementCount; i++) {
-        for (let j = 0; j < elementSize; j++)
-            builder.appendU8(j);
+        // i16{lane0 ... lane7} -> i8{lane0 ... lane7, 0 ...}
+        builder.appendSimd(WasmSimdOpcode.i8x16_narrow_i16x8_u);
+        // i8{0, 1, 2, 3 ...} -> i8{0, 0, 1, 1, 2, 2, 3, 3 ...}
+        builder.appendSimd(WasmSimdOpcode.v128_const);
+        for (let i = 0; i < elementCount; i++) {
+            for (let j = 0; j < elementSize; j++)
+                builder.appendU8(i);
+        }
+        builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
+        // multiply indices by 2 to scale from char indices to byte indices
+        builder.i32_const(elementCount === 4 ? 2 : 1);
+        builder.appendSimd(WasmSimdOpcode.i8x16_shl);
+        // now add 1 to the secondary lane of each char
+        builder.appendSimd(WasmSimdOpcode.v128_const);
+        for (let i = 0; i < elementCount; i++) {
+            for (let j = 0; j < elementSize; j++)
+                builder.appendU8(j);
+        }
     }
-    // we can do a bitwise or since we know we previously multiplied all the lanes by 2
-    builder.appendSimd(WasmSimdOpcode.v128_or);
     // we now have two vectors on the stack, the values and the byte indices
     builder.appendSimd(WasmSimdOpcode.i8x16_swizzle);
     append_simd_store(builder, ip);