Bug 1656226 - Implement the experimental opcodes. r=jseward

Implement some of the experimental SIMD opcodes that are supported by all of V8, LLVM, and Binaryen, for maximum compatibility with test content we might be exposed to. Most/all of these will probably make it into the spec, as they lead to substantial speedups in some programs, and they are deterministic. For spec and CPU mapping details, see: WebAssembly/simd#122 (pmax/pmin), WebAssembly/simd#232 (rounding), WebAssembly/simd#127 (dot product), and WebAssembly/simd#237 (load zero). The wasm bytecode values used here come from the binaryen changes that are linked from those tickets; that's the best documentation right now. Current binaryen opcode mappings are here: https://github.com/WebAssembly/binaryen/blob/master/src/wasm-binary.h Also: Drive-by fix for signatures of vroundss and vroundsd; these are unary operations and should follow the conventions for these with src/dest arguments, not src0/src1/dest. Also: Drive-by fix to add variants of vmovss and vmovsd on x64 that take Operand source and FloatRegister destination. Differential Revision: https://phabricator.services.mozilla.com/D85982
ambroff · Aug 12, 2020 · 9c46265 · 9c46265
1 parent 5b91ef7
commit 9c46265
Show file tree

Hide file tree

Showing 21 changed files with 766 additions and 45 deletions.
diff --git a/js/src/jit-test/lib/wasm-binary.js b/js/src/jit-test/lib/wasm-binary.js
@@ -39,6 +39,7 @@ const I32Code          = 0x7f;
 const I64Code          = 0x7e;
 const F32Code          = 0x7d;
 const F64Code          = 0x7c;
+const V128Code         = 0x7b;
 const AnyFuncCode      = 0x70;
 const AnyrefCode       = 0x6f;
 const OptRefCode       = 0x6c;
@@ -53,6 +54,7 @@ const CallCode         = 0x10;
 const CallIndirectCode = 0x11;
 const DropCode         = 0x1a;
 const SelectCode       = 0x1b;
+const LocalGetCode     = 0x20;
 const I32Load          = 0x28;
 const I64Load          = 0x29;
 const F32Load          = 0x2a;
@@ -102,6 +104,27 @@ const RefNullCode      = 0xd0;
 const RefIsNullCode    = 0xd1;
 const RefFuncCode      = 0xd2;
 
+// SIMD opcodes
+const V128LoadCode = 0x00;
+const V128StoreCode = 0x0b;
+
+// Experimental SIMD opcodes as of August, 2020.
+const I32x4DotSI16x8Code = 0xba;
+const F32x4CeilCode = 0xd8;
+const F32x4FloorCode = 0xd9;
+const F32x4TruncCode = 0xda;
+const F32x4NearestCode = 0xdb;
+const F64x2CeilCode = 0xdc;
+const F64x2FloorCode = 0xdd;
+const F64x2TruncCode = 0xde;
+const F64x2NearestCode = 0xdf;
+const F32x4PMinCode = 0xea;
+const F32x4PMaxCode = 0xeb;
+const F64x2PMinCode = 0xf6;
+const F64x2PMaxCode = 0xf7;
+const V128Load32ZeroCode = 0xfc;
+const V128Load64ZeroCode = 0xfd;
+
 const FirstInvalidOpcode = 0xc5;
 const LastInvalidOpcode = 0xfa;
 const GcPrefix = 0xfb;
@@ -300,8 +323,15 @@ function exportSection(exports) {
     body.push(...varU32(exports.length));
     for (let exp of exports) {
         body.push(...string(exp.name));
-        body.push(...varU32(FunctionCode));
-        body.push(...varU32(exp.funcIndex));
+        if (exp.hasOwnProperty("funcIndex")) {
+            body.push(...varU32(FunctionCode));
+            body.push(...varU32(exp.funcIndex));
+        } else if (exp.hasOwnProperty("memIndex")) {
+            body.push(...varU32(MemoryCode));
+            body.push(...varU32(exp.memIndex));
+        } else {
+            throw "Bad export " + exp;
+        }
     }
     return { name: exportId, body };
 }

diff --git a/js/src/jit-test/tests/wasm/simd/experimental.js b/js/src/jit-test/tests/wasm/simd/experimental.js
@@ -0,0 +1,200 @@
+// Experimental opcodes.  We have no text parsing support for these yet.  The
+// tests will be cleaned up and moved into ad-hack.js if the opcodes are
+// adopted.
+
+// When simd is enabled by default in release builds we will flip the value of
+// SimdExperimentalEnabled to false in RELEASE_OR_BETA builds.  At that point,
+// these tests will start failing in release or beta builds, and a guard
+// asserting !RELEASE_OR_BETA will have to be added above.  That is how it
+// should be.
+
+load(libdir + "wasm-binary.js");
+
+function wasmEval(bytes, imports) {
+    return new WebAssembly.Instance(new WebAssembly.Module(bytes), imports);
+}
+
+function get(arr, loc, len) {
+    let res = [];
+    for ( let i=0; i < len; i++ ) {
+        res.push(arr[loc+i]);
+    }
+    return res;
+}
+
+function set(arr, loc, vals) {
+    for ( let i=0; i < vals.length; i++ ) {
+        if (arr instanceof BigInt64Array) {
+            arr[loc+i] = BigInt(vals[i]);
+        } else {
+            arr[loc+i] = vals[i];
+        }
+    }
+}
+
+function assertSame(got, expected) {
+    assertEq(got.length, expected.length);
+    for ( let i=0; i < got.length; i++ ) {
+        let g = got[i];
+        let e = expected[i];
+        if (typeof g != typeof e) {
+            if (typeof g == "bigint")
+                e = BigInt(e);
+            else if (typeof e == "bigint")
+                g = BigInt(g);
+        }
+        assertEq(g, e);
+    }
+}
+
+function iota(len) {
+    let xs = [];
+    for ( let i=0 ; i < len ; i++ )
+        xs.push(i);
+    return xs;
+}
+
+function pmin(x, y) { return y < x ? y : x }
+function pmax(x, y) { return x < y ? y : x }
+
+function ffloor(x) { return Math.fround(Math.floor(x)) }
+function fceil(x) { return Math.fround(Math.ceil(x)) }
+function ftrunc(x) { return Math.fround(Math.sign(x)*Math.floor(Math.abs(x))) }
+function fnearest(x) { return Math.fround(dnearest(x)) }
+// wasm `nearest` rounds ties to even; Math.round rounds ties toward +Infinity, so dnearest corrects odd results.
+function dfloor(x) { return Math.floor(x) }
+function dceil(x) { return Math.ceil(x) }
+function dtrunc(x) { return Math.sign(x)*Math.floor(Math.abs(x)) }
+function dnearest(x) { let r = Math.round(x); return r - x == 0.5 && r % 2 ? r - 1 : r }
+
+const v2vSig = {args:[], ret:VoidCode};
+
+function V128Load(addr) {
+    return [I32ConstCode, varS32(addr),
+            SimdPrefix, V128LoadCode, 4, varU32(0)]
+}
+
+function V128StoreExpr(addr, v) {
+    return [I32ConstCode, varS32(addr),
+            ...v,
+            SimdPrefix, V128StoreCode, 4, varU32(0)];
+}
+
+// Pseudo-min/max, https://github.com/WebAssembly/simd/pull/122
+var fxs = [5, 1, -4, 2];
+var fys = [6, 0, -7, 3];
+var dxs = [5, 1];
+var dys = [6, 0];
+
+for ( let [opcode, xs, ys, operator] of [[F32x4PMinCode, fxs, fys, pmin],
+                                         [F32x4PMaxCode, fxs, fys, pmax],
+                                         [F64x2PMinCode, dxs, dys, pmin],
+                                         [F64x2PMaxCode, dxs, dys, pmax]] ) {
+    var k = xs.length;
+    var ans = iota(k).map((i) => operator(xs[i], ys[i]))
+
+    var ins = wasmEval(moduleWithSections([
+        sigSection([v2vSig]),
+        declSection([0]),
+        memorySection(1),
+        exportSection([{funcIndex: 0, name: "run"},
+                       {memIndex: 0, name: "mem"}]),
+        bodySection([
+            funcBody({locals:[],
+                      body: [...V128StoreExpr(0, [...V128Load(16),
+                                                  ...V128Load(32),
+                                                  SimdPrefix, varU32(opcode)])]})])]));
+
+    var mem = new (k == 4 ? Float32Array : Float64Array)(ins.exports.mem.buffer);
+    set(mem, k, xs);
+    set(mem, 2*k, ys);
+    ins.exports.run();
+    var result = get(mem, 0, k);
+    assertSame(result, ans);
+}
+
+// Widening integer dot product, https://github.com/WebAssembly/simd/pull/127
+
+var ins = wasmEval(moduleWithSections([
+    sigSection([v2vSig]),
+    declSection([0]),
+    memorySection(1),
+    exportSection([{funcIndex: 0, name: "run"},
+                   {memIndex: 0, name: "mem"}]),
+    bodySection([
+        funcBody({locals:[],
+                  body: [...V128StoreExpr(0, [...V128Load(16),
+                                              ...V128Load(32),
+                                              SimdPrefix, varU32(I32x4DotSI16x8Code)])]})])]));
+
+var xs = [5, 1, -4, 2, 20, -15, 12, 3];
+var ys = [6, 0, -7, 3, 8, -1, -3, 7];
+var ans = [xs[0]*ys[0] + xs[1]*ys[1],
+           xs[2]*ys[2] + xs[3]*ys[3],
+           xs[4]*ys[4] + xs[5]*ys[5],
+           xs[6]*ys[6] + xs[7]*ys[7]];
+
+var mem16 = new Int16Array(ins.exports.mem.buffer);
+var mem32 = new Int32Array(ins.exports.mem.buffer);
+set(mem16, 8, xs);
+set(mem16, 16, ys);
+ins.exports.run();
+var result = get(mem32, 0, 4);
+assertSame(result, ans);
+
+// Rounding, https://github.com/WebAssembly/simd/pull/232
+
+var fxs = [5.1, -1.1, -4.3, 0];
+var dxs = [5.1, -1.1];
+
+for ( let [opcode, xs, operator] of [[F32x4CeilCode, fxs, fceil],
+                                     [F32x4FloorCode, fxs, ffloor],
+                                     [F32x4TruncCode, fxs, ftrunc],
+                                     [F32x4NearestCode, fxs, fnearest],
+                                     [F64x2CeilCode, dxs, dceil],
+                                     [F64x2FloorCode, dxs, dfloor],
+                                     [F64x2TruncCode, dxs, dtrunc],
+                                     [F64x2NearestCode, dxs, dnearest]] ) {
+    var k = xs.length;
+    var ans = xs.map(operator);
+
+    var ins = wasmEval(moduleWithSections([
+        sigSection([v2vSig]),
+        declSection([0]),
+        memorySection(1),
+        exportSection([{funcIndex: 0, name: "run"},
+                       {memIndex: 0, name: "mem"}]),
+        bodySection([
+            funcBody({locals:[],
+                      body: [...V128StoreExpr(0, [...V128Load(16),
+                                                  SimdPrefix, varU32(opcode)])]})])]));
+
+    var mem = new (k == 4 ? Float32Array : Float64Array)(ins.exports.mem.buffer);
+    set(mem, k, xs);
+    ins.exports.run();
+    var result = get(mem, 0, k);
+    assertSame(result, ans);
+}
+
+// Zero-extending SIMD load, https://github.com/WebAssembly/simd/pull/237
+
+for ( let [opcode, k, log2align, cons, cast] of [[V128Load32ZeroCode, 4, 2, Int32Array, Number],
+                                                 [V128Load64ZeroCode, 2, 3, BigInt64Array, BigInt]] ) {
+    var ins = wasmEval(moduleWithSections([
+        sigSection([v2vSig]),
+        declSection([0]),
+        memorySection(1),
+        exportSection([{funcIndex: 0, name: "run"},
+                       {memIndex: 0, name: "mem"}]),
+        bodySection([
+            funcBody({locals:[],
+                      body: [...V128StoreExpr(0, [I32ConstCode, varS32(16), // i32.const takes a signed LEB
+                                                  SimdPrefix, varU32(opcode), log2align, varU32(0)])]})])]));
+    var mem = new cons(ins.exports.mem.buffer);
+    // Source lane 0 holds 37; the other lanes are poisoned with -1 so a plain full-width load would be caught.
+    set(mem, k, iota(k).map((v) => cast(v == 0 ? 37 : -1)));
+    ins.exports.run();
+    var result = get(mem, 0, k);
+    assertSame(result, iota(k).map((v) => v == 0 ? 37 : 0));
+}
+
diff --git a/js/src/jit/MacroAssembler.h b/js/src/jit/MacroAssembler.h
@@ -2524,6 +2524,54 @@ class MacroAssembler : public MacroAssemblerSpecific {
   inline void unsignedWidenLowInt32x4(FloatRegister src, FloatRegister dest)
       DEFINED_ON(x86_shared);
 
+  // Compare-based minimum/maximum (experimental as of August, 2020)
+  // https://github.com/WebAssembly/simd/pull/122
+
+  inline void pseudoMinFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
+      DEFINED_ON(x86_shared);
+
+  inline void pseudoMinFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
+      DEFINED_ON(x86_shared);
+
+  inline void pseudoMaxFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
+      DEFINED_ON(x86_shared);
+
+  inline void pseudoMaxFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
+      DEFINED_ON(x86_shared);
+
+  // Widening/pairwise integer dot product (experimental as of August, 2020)
+  // https://github.com/WebAssembly/simd/pull/127
+
+  inline void widenDotInt16x8(FloatRegister rhs, FloatRegister lhsDest)
+      DEFINED_ON(x86_shared);
+
+  // Floating point rounding (experimental as of August, 2020)
+  // https://github.com/WebAssembly/simd/pull/232
+
+  inline void ceilFloat32x4(FloatRegister src, FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
+  inline void ceilFloat64x2(FloatRegister src, FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
+  inline void floorFloat32x4(FloatRegister src, FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
+  inline void floorFloat64x2(FloatRegister src, FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
+  inline void truncFloat32x4(FloatRegister src, FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
+  inline void truncFloat64x2(FloatRegister src, FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
+  inline void nearestFloat32x4(FloatRegister src, FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
+  inline void nearestFloat64x2(FloatRegister src, FloatRegister dest)
+      DEFINED_ON(x86_shared);
+
  public:
   // ========================================================================
   // Truncate floating point.

diff --git a/js/src/jit/arm/MacroAssembler-arm.cpp b/js/src/jit/arm/MacroAssembler-arm.cpp
@@ -6028,6 +6028,7 @@ void MacroAssemblerARM::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
       }
     }
   } else {
+    MOZ_ASSERT(!access.isZeroExtendSimd128Load());
     bool isFloat = output.isFloat();
     if (isFloat) {
       MOZ_ASSERT((byteSize == 4) == output.fpu().isSingle());

diff --git a/js/src/jit/arm64/MacroAssembler-arm64.cpp b/js/src/jit/arm64/MacroAssembler-arm64.cpp
@@ -360,9 +360,11 @@ void MacroAssemblerCompat::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
         Ldr(SelectGPReg(outany, out64), srcAddr);
         break;
       case Scalar::Float32:
+        MOZ_ASSERT(!access.isZeroExtendSimd128Load());
         Ldr(SelectFPReg(outany, out64, 32), srcAddr);
         break;
       case Scalar::Float64:
+        MOZ_ASSERT(!access.isZeroExtendSimd128Load());
         Ldr(SelectFPReg(outany, out64, 64), srcAddr);
         break;
       case Scalar::Uint8Clamped:

diff --git a/js/src/jit/mips-shared/MacroAssembler-mips-shared.cpp b/js/src/jit/mips-shared/MacroAssembler-mips-shared.cpp
@@ -2126,9 +2126,11 @@ void MacroAssemblerMIPSShared::wasmLoadImpl(
       isSigned = false;
       break;
     case Scalar::Float64:
+      MOZ_ASSERT(!access.isZeroExtendSimd128Load());
       isFloat = true;
       break;
     case Scalar::Float32:
+      MOZ_ASSERT(!access.isZeroExtendSimd128Load());
       isFloat = true;
       break;
     default:

diff --git a/js/src/jit/shared/Assembler-shared.h b/js/src/jit/shared/Assembler-shared.h
@@ -492,6 +492,7 @@ class MemoryAccessDesc {
   Scalar::Type type_;
   jit::Synchronization sync_;
   wasm::BytecodeOffset trapOffset_;
+  bool zeroExtendSimd128Load_;
 
  public:
   explicit MemoryAccessDesc(
@@ -502,7 +503,8 @@ class MemoryAccessDesc {
         align_(align),
         type_(type),
         sync_(sync),
-        trapOffset_(trapOffset) {
+        trapOffset_(trapOffset),
+        zeroExtendSimd128Load_(false) {
     MOZ_ASSERT(mozilla::IsPowerOfTwo(align));
   }
 
@@ -513,6 +515,13 @@ class MemoryAccessDesc {
   const jit::Synchronization& sync() const { return sync_; }
   BytecodeOffset trapOffset() const { return trapOffset_; }
   bool isAtomic() const { return !sync_.isNone(); }
+  bool isZeroExtendSimd128Load() const { return zeroExtendSimd128Load_; }
+
+  void setZeroExtendSimd128Load() {
+    MOZ_ASSERT(type() == Scalar::Float32 || type() == Scalar::Float64);
+    MOZ_ASSERT(!isAtomic());
+    zeroExtendSimd128Load_ = true;
+  }
 
   void clearOffset() { offset_ = 0; }
   void setOffset(uint32_t offset) { offset_ = offset; }

diff --git a/js/src/jit/x64/MacroAssembler-x64.cpp b/js/src/jit/x64/MacroAssembler-x64.cpp
@@ -596,10 +596,12 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
       movl(srcAddr, out.gpr());
       break;
     case Scalar::Float32:
-      loadFloat32(srcAddr, out.fpu());
+      // vmovss does the right thing also for access.isZeroExtendSimd128Load()
+      vmovss(srcAddr, out.fpu());
       break;
     case Scalar::Float64:
-      loadDouble(srcAddr, out.fpu());
+      // vmovsd does the right thing also for access.isZeroExtendSimd128Load()
+      vmovsd(srcAddr, out.fpu());
       break;
     case Scalar::Simd128:
       MacroAssemblerX64::loadUnalignedSimd128(srcAddr, out.fpu());