From 807022887f155153d398f9f6f94ef59e214489a2 Mon Sep 17 00:00:00 2001
From: Lars T Hansen
Date: Wed, 12 Aug 2020 07:50:00 +0000
Subject: [PATCH] Bug 1656226 - Implement the experimental opcodes. r=jseward

Implement some of the experimental SIMD opcodes that are supported by
all of V8, LLVM, and Binaryen, for maximum compatibility with test
content we might be exposed to. Most or all of these will probably make
it into the spec, as they lead to substantial speedups in some
programs, and they are deterministic.

For spec and CPU mapping details, see:

https://github.com/WebAssembly/simd/pull/122 (pmax/pmin)
https://github.com/WebAssembly/simd/pull/232 (rounding)
https://github.com/WebAssembly/simd/pull/127 (dot product)
https://github.com/WebAssembly/simd/pull/237 (load zero)

The wasm bytecode values used here come from the binaryen changes that
are linked from those tickets; that is the best documentation right
now. Current binaryen opcode mappings are here:

https://github.com/WebAssembly/binaryen/blob/master/src/wasm-binary.h

Also: Drive-by fix for the signatures of vroundss and vroundsd: these
are unary operations and should follow the conventions for those, with
src/dest arguments, not src0/src1/dest.

Also: Drive-by fix to add variants of vmovss and vmovsd on x64 that
take an Operand source and a FloatRegister destination.

Differential Revision: https://phabricator.services.mozilla.com/D85982
---
 js/src/jit-test/lib/wasm-binary.js | 34 ++-
 .../jit-test/tests/wasm/simd/experimental.js | 200 ++++++++++++++++++
 js/src/jit/MacroAssembler.h | 48 +++++
 js/src/jit/arm/MacroAssembler-arm.cpp | 1 +
 js/src/jit/arm64/MacroAssembler-arm64.cpp | 2 +
 .../MacroAssembler-mips-shared.cpp | 2 +
 js/src/jit/shared/Assembler-shared.h | 11 +-
 js/src/jit/x64/MacroAssembler-x64.cpp | 6 +-
 js/src/jit/x86-shared/Assembler-x86-shared.h | 75 ++++++-
 .../jit/x86-shared/BaseAssembler-x86-shared.h | 23 +-
 .../x86-shared/CodeGenerator-x86-shared.cpp | 45 +++-
 js/src/jit/x86-shared/Constants-x86-shared.h | 13 +-
 js/src/jit/x86-shared/Encoding-x86-shared.h | 3 +
 .../MacroAssembler-x86-shared-inl.h | 64 +++++-
 .../x86-shared/MacroAssembler-x86-shared.cpp | 12 +-
 js/src/jit/x86/MacroAssembler-x86.cpp | 2 +
 js/src/wasm/WasmBaselineCompile.cpp | 124 ++++++++++-
 js/src/wasm/WasmConstants.h | 40 ++--
 js/src/wasm/WasmIonCompile.cpp | 54 +++++
 js/src/wasm/WasmOpIter.cpp | 15 ++
 js/src/wasm/WasmValidate.cpp | 37 ++++
 21 files changed, 766 insertions(+), 45 deletions(-)
 create mode 100644 js/src/jit-test/tests/wasm/simd/experimental.js

diff --git a/js/src/jit-test/lib/wasm-binary.js b/js/src/jit-test/lib/wasm-binary.js
index d8eceafe13c4f..81b5d2a68165e 100644
--- a/js/src/jit-test/lib/wasm-binary.js
+++ b/js/src/jit-test/lib/wasm-binary.js
@@ -39,6 +39,7 @@ const I32Code = 0x7f;
 const I64Code = 0x7e;
 const F32Code = 0x7d;
 const F64Code = 0x7c;
+const V128Code = 0x7b;
 const AnyFuncCode = 0x70;
 const AnyrefCode = 0x6f;
 const OptRefCode = 0x6c;
@@ -53,6 +54,7 @@ const CallCode = 0x10;
 const CallIndirectCode = 0x11;
 const DropCode = 0x1a;
 const SelectCode = 0x1b;
+const LocalGetCode = 0x20;
 const I32Load = 0x28;
 const I64Load = 0x29;
 const F32Load = 0x2a;
@@ -102,6 +104,27 @@ const RefNullCode = 0xd0;
 const RefIsNullCode = 0xd1;
 const RefFuncCode = 0xd2;

+// SIMD opcodes
+const V128LoadCode = 0x00;
+const V128StoreCode = 0x0b;
+
+// Experimental SIMD opcodes as of August, 2020.
+const I32x4DotSI16x8Code = 0xba; +const F32x4CeilCode = 0xd8; +const F32x4FloorCode = 0xd9; +const F32x4TruncCode = 0xda; +const F32x4NearestCode = 0xdb; +const F64x2CeilCode = 0xdc; +const F64x2FloorCode = 0xdd; +const F64x2TruncCode = 0xde; +const F64x2NearestCode = 0xdf; +const F32x4PMinCode = 0xea; +const F32x4PMaxCode = 0xeb; +const F64x2PMinCode = 0xf6; +const F64x2PMaxCode = 0xf7; +const V128Load32ZeroCode = 0xfc; +const V128Load64ZeroCode = 0xfd; + const FirstInvalidOpcode = 0xc5; const LastInvalidOpcode = 0xfa; const GcPrefix = 0xfb; @@ -300,8 +323,15 @@ function exportSection(exports) { body.push(...varU32(exports.length)); for (let exp of exports) { body.push(...string(exp.name)); - body.push(...varU32(FunctionCode)); - body.push(...varU32(exp.funcIndex)); + if (exp.hasOwnProperty("funcIndex")) { + body.push(...varU32(FunctionCode)); + body.push(...varU32(exp.funcIndex)); + } else if (exp.hasOwnProperty("memIndex")) { + body.push(...varU32(MemoryCode)); + body.push(...varU32(exp.memIndex)); + } else { + throw "Bad export " + exp; + } } return { name: exportId, body }; } diff --git a/js/src/jit-test/tests/wasm/simd/experimental.js b/js/src/jit-test/tests/wasm/simd/experimental.js new file mode 100644 index 0000000000000..353cd9fb3a579 --- /dev/null +++ b/js/src/jit-test/tests/wasm/simd/experimental.js @@ -0,0 +1,200 @@ +// Experimental opcodes. We have no text parsing support for these yet. The +// tests will be cleaned up and moved into ad-hack.js if the opcodes are +// adopted. + +// When simd is enabled by default in release builds we will flip the value of +// SimdExperimentalEnabled to false in RELEASE_OR_BETA builds. At that point, +// these tests will start failing in release or beta builds, and a guard +// asserting !RELEASE_OR_BETA will have to be added above. That is how it +// should be. + +load(libdir + "wasm-binary.js"); + +function wasmEval(bytes, imports) { + return new WebAssembly.Instance(new WebAssembly.Module(bytes), imports); +} + +function get(arr, loc, len) { + let res = []; + for ( let i=0; i < len; i++ ) { + res.push(arr[loc+i]); + } + return res; +} + +function set(arr, loc, vals) { + for ( let i=0; i < vals.length; i++ ) { + if (arr instanceof BigInt64Array) { + arr[loc+i] = BigInt(vals[i]); + } else { + arr[loc+i] = vals[i]; + } + } +} + +function assertSame(got, expected) { + assertEq(got.length, expected.length); + for ( let i=0; i < got.length; i++ ) { + let g = got[i]; + let e = expected[i]; + if (typeof g != typeof e) { + if (typeof g == "bigint") + e = BigInt(e); + else if (typeof e == "bigint") + g = BigInt(g); + } + assertEq(g, e); + } +} + +function iota(len) { + let xs = []; + for ( let i=0 ; i < len ; i++ ) + xs.push(i); + return xs; +} + +function pmin(x, y) { return y < x ? y : x } +function pmax(x, y) { return x < y ? 
y : x } + +function ffloor(x) { return Math.fround(Math.floor(x)) } +function fceil(x) { return Math.fround(Math.ceil(x)) } +function ftrunc(x) { return Math.fround(Math.sign(x)*Math.floor(Math.abs(x))) } +function fnearest(x) { return Math.fround(Math.round(x)) } + +function dfloor(x) { return Math.floor(x) } +function dceil(x) { return Math.ceil(x) } +function dtrunc(x) { return Math.sign(x)*Math.floor(Math.abs(x)) } +function dnearest(x) { return Math.round(x) } + +const v2vSig = {args:[], ret:VoidCode}; + +function V128Load(addr) { + return [I32ConstCode, varS32(addr), + SimdPrefix, V128LoadCode, 4, varU32(0)] +} + +function V128StoreExpr(addr, v) { + return [I32ConstCode, varS32(addr), + ...v, + SimdPrefix, V128StoreCode, 4, varU32(0)]; +} + +// Pseudo-min/max, https://github.com/WebAssembly/simd/pull/122 +var fxs = [5, 1, -4, 2]; +var fys = [6, 0, -7, 3]; +var dxs = [5, 1]; +var dys = [6, 0]; + +for ( let [opcode, xs, ys, operator] of [[F32x4PMinCode, fxs, fys, pmin], + [F32x4PMaxCode, fxs, fys, pmax], + [F64x2PMinCode, dxs, dys, pmin], + [F64x2PMaxCode, dxs, dys, pmax]] ) { + var k = xs.length; + var ans = iota(k).map((i) => operator(xs[i], ys[i])) + + var ins = wasmEval(moduleWithSections([ + sigSection([v2vSig]), + declSection([0]), + memorySection(1), + exportSection([{funcIndex: 0, name: "run"}, + {memIndex: 0, name: "mem"}]), + bodySection([ + funcBody({locals:[], + body: [...V128StoreExpr(0, [...V128Load(16), + ...V128Load(32), + SimdPrefix, varU32(opcode)])]})])])); + + var mem = new (k == 4 ? Float32Array : Float64Array)(ins.exports.mem.buffer); + set(mem, k, xs); + set(mem, 2*k, ys); + ins.exports.run(); + var result = get(mem, 0, k); + assertSame(result, ans); +} + +// Widening integer dot product, https://github.com/WebAssembly/simd/pull/127 + +var ins = wasmEval(moduleWithSections([ + sigSection([v2vSig]), + declSection([0]), + memorySection(1), + exportSection([{funcIndex: 0, name: "run"}, + {memIndex: 0, name: "mem"}]), + bodySection([ + funcBody({locals:[], + body: [...V128StoreExpr(0, [...V128Load(16), + ...V128Load(32), + SimdPrefix, varU32(I32x4DotSI16x8Code)])]})])])); + +var xs = [5, 1, -4, 2, 20, -15, 12, 3]; +var ys = [6, 0, -7, 3, 8, -1, -3, 7]; +var ans = [xs[0]*ys[0] + xs[1]*ys[1], + xs[2]*ys[2] + xs[3]*ys[3], + xs[4]*ys[4] + xs[5]*ys[5], + xs[6]*ys[6] + xs[7]*ys[7]]; + +var mem16 = new Int16Array(ins.exports.mem.buffer); +var mem32 = new Int32Array(ins.exports.mem.buffer); +set(mem16, 8, xs); +set(mem16, 16, ys); +ins.exports.run(); +var result = get(mem32, 0, 4); +assertSame(result, ans); + +// Rounding, https://github.com/WebAssembly/simd/pull/232 + +var fxs = [5.1, -1.1, -4.3, 0]; +var dxs = [5.1, -1.1]; + +for ( let [opcode, xs, operator] of [[F32x4CeilCode, fxs, fceil], + [F32x4FloorCode, fxs, ffloor], + [F32x4TruncCode, fxs, ftrunc], + [F32x4NearestCode, fxs, fnearest], + [F64x2CeilCode, dxs, dceil], + [F64x2FloorCode, dxs, dfloor], + [F64x2TruncCode, dxs, dtrunc], + [F64x2NearestCode, dxs, dnearest]] ) { + var k = xs.length; + var ans = xs.map(operator); + + var ins = wasmEval(moduleWithSections([ + sigSection([v2vSig]), + declSection([0]), + memorySection(1), + exportSection([{funcIndex: 0, name: "run"}, + {memIndex: 0, name: "mem"}]), + bodySection([ + funcBody({locals:[], + body: [...V128StoreExpr(0, [...V128Load(16), + SimdPrefix, varU32(opcode)])]})])])); + + var mem = new (k == 4 ? 
Float32Array : Float64Array)(ins.exports.mem.buffer); + set(mem, k, xs); + ins.exports.run(); + var result = get(mem, 0, k); + assertSame(result, ans); +} + +// Zero-extending SIMD load, https://github.com/WebAssembly/simd/pull/237 + +for ( let [opcode, k, log2align, cons, cast] of [[V128Load32ZeroCode, 4, 2, Int32Array, Number], + [V128Load64ZeroCode, 2, 3, BigInt64Array, BigInt]] ) { + var ins = wasmEval(moduleWithSections([ + sigSection([v2vSig]), + declSection([0]), + memorySection(1), + exportSection([{funcIndex: 0, name: "run"}, + {memIndex: 0, name: "mem"}]), + bodySection([ + funcBody({locals:[], + body: [...V128StoreExpr(0, [I32ConstCode, varU32(16), + SimdPrefix, varU32(opcode), log2align, varU32(0)])]})])])); + + var mem = new cons(ins.exports.mem.buffer); + mem[k] = cast(37); + ins.exports.run(); + var result = get(mem, 0, k); + assertSame(result, iota(k).map((v) => v == 0 ? 37 : 0)); +} + diff --git a/js/src/jit/MacroAssembler.h b/js/src/jit/MacroAssembler.h index 2bf177b302c81..7f7aac20118cd 100644 --- a/js/src/jit/MacroAssembler.h +++ b/js/src/jit/MacroAssembler.h @@ -2524,6 +2524,54 @@ class MacroAssembler : public MacroAssemblerSpecific { inline void unsignedWidenLowInt32x4(FloatRegister src, FloatRegister dest) DEFINED_ON(x86_shared); + // Compare-based minimum/maximum (experimental as of August, 2020) + // https://github.com/WebAssembly/simd/pull/122 + + inline void pseudoMinFloat32x4(FloatRegister rhs, FloatRegister lhsDest) + DEFINED_ON(x86_shared); + + inline void pseudoMinFloat64x2(FloatRegister rhs, FloatRegister lhsDest) + DEFINED_ON(x86_shared); + + inline void pseudoMaxFloat32x4(FloatRegister rhs, FloatRegister lhsDest) + DEFINED_ON(x86_shared); + + inline void pseudoMaxFloat64x2(FloatRegister rhs, FloatRegister lhsDest) + DEFINED_ON(x86_shared); + + // Widening/pairwise integer dot product (experimental as of August, 2020) + // https://github.com/WebAssembly/simd/pull/127 + + inline void widenDotInt16x8(FloatRegister rhs, FloatRegister lhsDest) + DEFINED_ON(x86_shared); + + // Floating point rounding (experimental as of August, 2020) + // https://github.com/WebAssembly/simd/pull/232 + + inline void ceilFloat32x4(FloatRegister src, FloatRegister dest) + DEFINED_ON(x86_shared); + + inline void ceilFloat64x2(FloatRegister src, FloatRegister dest) + DEFINED_ON(x86_shared); + + inline void floorFloat32x4(FloatRegister src, FloatRegister dest) + DEFINED_ON(x86_shared); + + inline void floorFloat64x2(FloatRegister src, FloatRegister dest) + DEFINED_ON(x86_shared); + + inline void truncFloat32x4(FloatRegister src, FloatRegister dest) + DEFINED_ON(x86_shared); + + inline void truncFloat64x2(FloatRegister src, FloatRegister dest) + DEFINED_ON(x86_shared); + + inline void nearestFloat32x4(FloatRegister src, FloatRegister dest) + DEFINED_ON(x86_shared); + + inline void nearestFloat64x2(FloatRegister src, FloatRegister dest) + DEFINED_ON(x86_shared); + public: // ======================================================================== // Truncate floating point. 
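(Aside for review, not part of the patch: a minimal scalar sketch of the per-lane semantics behind the compare-based min/max and widening dot product declared above, matching the pmin/pmax reference functions in experimental.js and the proposals linked in the commit message. The helper names here are hypothetical.)

    // Per-lane semantics sketch (hypothetical helpers, illustration only).
    #include <cstdint>

    // Pseudo-min/max are a plain compare-and-select: unlike the
    // NaN-propagating min/max declared above, the result for NaN and +/-0
    // inputs is determined by operand order.
    float PseudoMin(float x, float y) { return y < x ? y : x; }
    float PseudoMax(float x, float y) { return x < y ? y : x; }

    // One lane of i32x4.dot_i16x8_s: multiply adjacent i16 pairs and add.
    // The sum is formed in 64 bits here; only two -32768*-32768 products
    // can exceed int32 range, and that single case wraps.
    int32_t DotLane(const int16_t a[2], const int16_t b[2]) {
      int64_t sum = int64_t(a[0]) * b[0] + int64_t(a[1]) * b[1];
      return int32_t(sum);
    }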
diff --git a/js/src/jit/arm/MacroAssembler-arm.cpp b/js/src/jit/arm/MacroAssembler-arm.cpp
index 58e108663494e..f78080baaf9e9 100644
--- a/js/src/jit/arm/MacroAssembler-arm.cpp
+++ b/js/src/jit/arm/MacroAssembler-arm.cpp
@@ -6028,6 +6028,7 @@ void MacroAssemblerARM::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
       }
     }
   } else {
+    MOZ_ASSERT(!access.isZeroExtendSimd128Load());
     bool isFloat = output.isFloat();
     if (isFloat) {
       MOZ_ASSERT((byteSize == 4) == output.fpu().isSingle());
diff --git a/js/src/jit/arm64/MacroAssembler-arm64.cpp b/js/src/jit/arm64/MacroAssembler-arm64.cpp
index dc966ea45b697..8366029f6d568 100644
--- a/js/src/jit/arm64/MacroAssembler-arm64.cpp
+++ b/js/src/jit/arm64/MacroAssembler-arm64.cpp
@@ -360,9 +360,11 @@ void MacroAssemblerCompat::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
       Ldr(SelectGPReg(outany, out64), srcAddr);
       break;
     case Scalar::Float32:
+      MOZ_ASSERT(!access.isZeroExtendSimd128Load());
      Ldr(SelectFPReg(outany, out64, 32), srcAddr);
       break;
     case Scalar::Float64:
+      MOZ_ASSERT(!access.isZeroExtendSimd128Load());
      Ldr(SelectFPReg(outany, out64, 64), srcAddr);
       break;
     case Scalar::Uint8Clamped:
diff --git a/js/src/jit/mips-shared/MacroAssembler-mips-shared.cpp b/js/src/jit/mips-shared/MacroAssembler-mips-shared.cpp
index c01fbf72cf03e..55d8cad648e90 100644
--- a/js/src/jit/mips-shared/MacroAssembler-mips-shared.cpp
+++ b/js/src/jit/mips-shared/MacroAssembler-mips-shared.cpp
@@ -2126,9 +2126,11 @@ void MacroAssemblerMIPSShared::wasmLoadImpl(
       isSigned = false;
       break;
     case Scalar::Float64:
+      MOZ_ASSERT(!access.isZeroExtendSimd128Load());
       isFloat = true;
       break;
     case Scalar::Float32:
+      MOZ_ASSERT(!access.isZeroExtendSimd128Load());
       isFloat = true;
       break;
     default:
diff --git a/js/src/jit/shared/Assembler-shared.h b/js/src/jit/shared/Assembler-shared.h
index 9111b4a0734f1..f2c9851a3f76d 100644
--- a/js/src/jit/shared/Assembler-shared.h
+++ b/js/src/jit/shared/Assembler-shared.h
@@ -492,6 +492,7 @@ class MemoryAccessDesc {
   Scalar::Type type_;
   jit::Synchronization sync_;
   wasm::BytecodeOffset trapOffset_;
+  bool zeroExtendSimd128Load_;

 public:
  explicit MemoryAccessDesc(
@@ -502,7 +503,8 @@ class MemoryAccessDesc {
        align_(align),
        type_(type),
        sync_(sync),
-        trapOffset_(trapOffset) {
+        trapOffset_(trapOffset),
+        zeroExtendSimd128Load_(false) {
     MOZ_ASSERT(mozilla::IsPowerOfTwo(align));
   }

@@ -513,6 +515,13 @@ class MemoryAccessDesc {
   const jit::Synchronization& sync() const { return sync_; }
   BytecodeOffset trapOffset() const { return trapOffset_; }
   bool isAtomic() const { return !sync_.isNone(); }
+  bool isZeroExtendSimd128Load() const { return zeroExtendSimd128Load_; }
+
+  void setZeroExtendSimd128Load() {
+    MOZ_ASSERT(type() == Scalar::Float32 || type() == Scalar::Float64);
+    MOZ_ASSERT(!isAtomic());
+    zeroExtendSimd128Load_ = true;
+  }

   void clearOffset() { offset_ = 0; }
   void setOffset(uint32_t offset) { offset_ = offset; }
diff --git a/js/src/jit/x64/MacroAssembler-x64.cpp b/js/src/jit/x64/MacroAssembler-x64.cpp
index 3a485ec4ff677..617497785e7bf 100644
--- a/js/src/jit/x64/MacroAssembler-x64.cpp
+++ b/js/src/jit/x64/MacroAssembler-x64.cpp
@@ -596,10 +596,12 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
       movl(srcAddr, out.gpr());
       break;
     case Scalar::Float32:
-      loadFloat32(srcAddr, out.fpu());
+      // vmovss does the right thing also for access.isZeroExtendSimd128Load()
+      vmovss(srcAddr, out.fpu());
       break;
     case Scalar::Float64:
-      loadDouble(srcAddr, out.fpu());
+      // vmovsd does the right thing also for access.isZeroExtendSimd128Load()
+      vmovsd(srcAddr,
out.fpu()); break; case Scalar::Simd128: MacroAssemblerX64::loadUnalignedSimd128(srcAddr, out.fpu()); diff --git a/js/src/jit/x86-shared/Assembler-x86-shared.h b/js/src/jit/x86-shared/Assembler-x86-shared.h index 4401f099fea9e..1ab9213c57ed7 100644 --- a/js/src/jit/x86-shared/Assembler-x86-shared.h +++ b/js/src/jit/x86-shared/Assembler-x86-shared.h @@ -328,6 +328,13 @@ class AssemblerX86Shared : public AssemblerShared { NoParity = X86Encoding::ConditionNP }; + enum class SSERoundingMode { + Nearest = int(X86Encoding::SSERoundingMode::RoundToNearest), + Floor = int(X86Encoding::SSERoundingMode::RoundDown), + Ceil = int(X86Encoding::SSERoundingMode::RoundUp), + Trunc = int(X86Encoding::SSERoundingMode::RoundToZero) + }; + // If this bit is set, the vucomisd operands have to be inverted. static const int DoubleConditionBitInvert = 0x10; @@ -631,6 +638,18 @@ class AssemblerX86Shared : public AssemblerShared { masm.vmovsd_mr(src.offset, src.base.encoding(), src.index.encoding(), src.scale, dest.encoding()); } + void vmovsd(const Operand& src, FloatRegister dest) { + switch (src.kind()) { + case Operand::MEM_REG_DISP: + vmovsd(src.toAddress(), dest); + break; + case Operand::MEM_SCALE: + vmovsd(src.toBaseIndex(), dest); + break; + default: + MOZ_CRASH("Unknown operand for vmovsd"); + } + } void vmovsd(FloatRegister src, const Address& dest) { masm.vmovsd_rm(src.encoding(), dest.offset, dest.base.encoding()); } @@ -649,6 +668,18 @@ class AssemblerX86Shared : public AssemblerShared { masm.vmovss_mr(src.offset, src.base.encoding(), src.index.encoding(), src.scale, dest.encoding()); } + void vmovss(const Operand& src, FloatRegister dest) { + switch (src.kind()) { + case Operand::MEM_REG_DISP: + vmovss(src.toAddress(), dest); + break; + case Operand::MEM_SCALE: + vmovss(src.toBaseIndex(), dest); + break; + default: + MOZ_CRASH("Unknown operand for vmovss"); + } + } void vmovss(FloatRegister src, const Address& dest) { masm.vmovss_rm(src.encoding(), dest.offset, dest.base.encoding()); } @@ -2958,6 +2989,16 @@ class AssemblerX86Shared : public AssemblerShared { MOZ_CRASH("unexpected operand kind"); } } + void vpmaddwd(const Operand& src1, FloatRegister src0, FloatRegister dest) { + MOZ_ASSERT(HasSSE2()); + switch (src1.kind()) { + case Operand::FPREG: + masm.vpmaddwd_rr(src1.fpu(), src0.encoding(), dest.encoding()); + break; + default: + MOZ_CRASH("unexpected operand kind"); + } + } void vpaddq(const Operand& src1, FloatRegister src0, FloatRegister dest) { MOZ_ASSERT(HasSSE2()); switch (src1.kind()) { @@ -3941,6 +3982,28 @@ class AssemblerX86Shared : public AssemblerShared { MOZ_ASSERT(HasSSE2()); masm.vsqrtss_rr(src1.encoding(), src0.encoding(), dest.encoding()); } + void vroundps(SSERoundingMode mode, const Operand& src, FloatRegister dest) { + MOZ_ASSERT(HasSSE41()); + switch (src.kind()) { + case Operand::FPREG: + masm.vroundps_irr((X86Encoding::SSERoundingMode)mode, src.fpu(), + dest.encoding()); + break; + default: + MOZ_CRASH("unexpected operand kind"); + } + } + void vroundpd(SSERoundingMode mode, const Operand& src, FloatRegister dest) { + MOZ_ASSERT(HasSSE41()); + switch (src.kind()) { + case Operand::FPREG: + masm.vroundpd_irr((X86Encoding::SSERoundingMode)mode, src.fpu(), + dest.encoding()); + break; + default: + MOZ_CRASH("unexpected operand kind"); + } + } static X86Encoding::RoundingMode ToX86RoundingMode(RoundingMode mode) { switch (mode) { @@ -3955,15 +4018,15 @@ class AssemblerX86Shared : public AssemblerShared { } MOZ_CRASH("unexpected mode"); } - void 
vroundsd(X86Encoding::RoundingMode mode, FloatRegister src1, - FloatRegister src0, FloatRegister dest) { + void vroundsd(X86Encoding::RoundingMode mode, FloatRegister src, + FloatRegister dest) { MOZ_ASSERT(HasSSE41()); - masm.vroundsd_irr(mode, src1.encoding(), src0.encoding(), dest.encoding()); + masm.vroundsd_irr(mode, src.encoding(), dest.encoding()); } - void vroundss(X86Encoding::RoundingMode mode, FloatRegister src1, - FloatRegister src0, FloatRegister dest) { + void vroundss(X86Encoding::RoundingMode mode, FloatRegister src, + FloatRegister dest) { MOZ_ASSERT(HasSSE41()); - masm.vroundss_irr(mode, src1.encoding(), src0.encoding(), dest.encoding()); + masm.vroundss_irr(mode, src.encoding(), dest.encoding()); } unsigned vinsertpsMask(unsigned sourceLane, unsigned destLane, diff --git a/js/src/jit/x86-shared/BaseAssembler-x86-shared.h b/js/src/jit/x86-shared/BaseAssembler-x86-shared.h index 16bba3c6e24e0..19df5863543ec 100644 --- a/js/src/jit/x86-shared/BaseAssembler-x86-shared.h +++ b/js/src/jit/x86-shared/BaseAssembler-x86-shared.h @@ -744,6 +744,9 @@ class BaseAssembler : public GenericAssembler { twoByteOpSimd("vpmuludq", VEX_PD, OP2_PMULUDQ_VdqWdq, offset, base, src0, dst); } + void vpmaddwd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) { + twoByteOpSimd("vpmaddwd", VEX_PD, OP2_PMADDWD_VdqWdq, src1, src0, dst); + } void vpmullw_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) { twoByteOpSimd("vpmullw", VEX_PD, OP2_PMULLW_VdqWdq, src1, src0, dst); @@ -3461,16 +3464,24 @@ class BaseAssembler : public GenericAssembler { twoByteOpSimd("vsqrtss", VEX_SS, OP2_SQRTSS_VssWss, src1, src0, dst); } - void vroundsd_irr(RoundingMode mode, XMMRegisterID src1, XMMRegisterID src0, - XMMRegisterID dst) { + void vroundsd_irr(RoundingMode mode, XMMRegisterID src, XMMRegisterID dst) { threeByteOpImmSimd("vroundsd", VEX_PD, OP3_ROUNDSD_VsdWsd, ESCAPE_3A, mode, - src1, src0, dst); + src, invalid_xmm, dst); } - void vroundss_irr(RoundingMode mode, XMMRegisterID src1, XMMRegisterID src0, - XMMRegisterID dst) { + void vroundss_irr(RoundingMode mode, XMMRegisterID src, XMMRegisterID dst) { threeByteOpImmSimd("vroundss", VEX_PD, OP3_ROUNDSS_VsdWsd, ESCAPE_3A, mode, - src1, src0, dst); + src, invalid_xmm, dst); + } + void vroundps_irr(SSERoundingMode mode, XMMRegisterID src, + XMMRegisterID dst) { + threeByteOpImmSimd("vroundps", VEX_PD, OP3_ROUNDPS_VpsWps, ESCAPE_3A, + int(mode), src, invalid_xmm, dst); + } + void vroundpd_irr(SSERoundingMode mode, XMMRegisterID src, + XMMRegisterID dst) { + threeByteOpImmSimd("vroundpd", VEX_PD, OP3_ROUNDPD_VpdWpd, ESCAPE_3A, + int(mode), src, invalid_xmm, dst); } void vinsertps_irr(uint32_t mask, XMMRegisterID src1, XMMRegisterID src0, diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp index 7ecf93d3cb1b8..23225efce133e 100644 --- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp +++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp @@ -1923,8 +1923,7 @@ void CodeGenerator::visitNearbyInt(LNearbyInt* lir) { FloatRegister output = ToFloatRegister(lir->output()); RoundingMode roundingMode = lir->mir()->roundingMode(); - masm.vroundsd(Assembler::ToX86RoundingMode(roundingMode), input, output, - output); + masm.vroundsd(Assembler::ToX86RoundingMode(roundingMode), input, output); } void CodeGenerator::visitNearbyIntF(LNearbyIntF* lir) { @@ -1932,8 +1931,7 @@ void CodeGenerator::visitNearbyIntF(LNearbyIntF* lir) { FloatRegister output = ToFloatRegister(lir->output()); 
  RoundingMode roundingMode = lir->mir()->roundingMode();

-  masm.vroundss(Assembler::ToX86RoundingMode(roundingMode), input, output,
-                output);
+  masm.vroundss(Assembler::ToX86RoundingMode(roundingMode), input, output);
 }

 void CodeGenerator::visitEffectiveAddress(LEffectiveAddress* ins) {
@@ -2591,6 +2589,21 @@ void CodeGenerator::visitWasmBinarySimd128(LWasmBinarySimd128* ins) {
     case wasm::SimdOp::F64x2Ge:
       masm.compareFloat64x2(Assembler::GreaterThanOrEqual, rhs, lhsDest);
       break;
+    case wasm::SimdOp::F32x4PMaxExperimental:
+      masm.pseudoMaxFloat32x4(rhs, lhsDest);
+      break;
+    case wasm::SimdOp::F32x4PMinExperimental:
+      masm.pseudoMinFloat32x4(rhs, lhsDest);
+      break;
+    case wasm::SimdOp::F64x2PMaxExperimental:
+      masm.pseudoMaxFloat64x2(rhs, lhsDest);
+      break;
+    case wasm::SimdOp::F64x2PMinExperimental:
+      masm.pseudoMinFloat64x2(rhs, lhsDest);
+      break;
+    case wasm::SimdOp::I32x4DotSI16x8Experimental:
+      masm.widenDotInt16x8(rhs, lhsDest);
+      break;
     default:
       MOZ_CRASH("Binary SimdOp not implemented");
   }
@@ -3118,6 +3131,30 @@ void CodeGenerator::visitWasmUnarySimd128(LWasmUnarySimd128* ins) {
     case wasm::SimdOp::I32x4Abs:
       masm.absInt32x4(src, dest);
       break;
+    case wasm::SimdOp::F32x4CeilExperimental:
+      masm.ceilFloat32x4(src, dest);
+      break;
+    case wasm::SimdOp::F32x4FloorExperimental:
+      masm.floorFloat32x4(src, dest);
+      break;
+    case wasm::SimdOp::F32x4TruncExperimental:
+      masm.truncFloat32x4(src, dest);
+      break;
+    case wasm::SimdOp::F32x4NearestExperimental:
+      masm.nearestFloat32x4(src, dest);
+      break;
+    case wasm::SimdOp::F64x2CeilExperimental:
+      masm.ceilFloat64x2(src, dest);
+      break;
+    case wasm::SimdOp::F64x2FloorExperimental:
+      masm.floorFloat64x2(src, dest);
+      break;
+    case wasm::SimdOp::F64x2TruncExperimental:
+      masm.truncFloat64x2(src, dest);
+      break;
+    case wasm::SimdOp::F64x2NearestExperimental:
+      masm.nearestFloat64x2(src, dest);
+      break;
     default:
       MOZ_CRASH("Unary SimdOp not implemented");
   }
diff --git a/js/src/jit/x86-shared/Constants-x86-shared.h b/js/src/jit/x86-shared/Constants-x86-shared.h
index e26d856664ce7..be16aa671165c 100644
--- a/js/src/jit/x86-shared/Constants-x86-shared.h
+++ b/js/src/jit/x86-shared/Constants-x86-shared.h
@@ -275,7 +275,7 @@ enum ConditionCmp {
   ConditionCmp_ORD = 0x7,
 };

-// Rounding modes for ROUNDSD.
+// Rounding modes for ROUNDSS / ROUNDSD.
 enum RoundingMode {
   RoundToNearest = 0x0,
   RoundDown = 0x1,
@@ -283,6 +283,17 @@ enum RoundingMode {
   RoundToZero = 0x3
 };

+// Rounding modes for ROUNDPS / ROUNDPD. These are the same as the
+// RoundingMode values above but with the 'inexact' bit set, which suppresses
+// the precision exception for lost precision. It's not obvious that this bit
+// is needed; it was, however, suggested in the wasm SIMD proposal.
+enum class SSERoundingMode {
+  RoundToNearest = 0x08,
+  RoundDown = 0x09,
+  RoundUp = 0x0A,
+  RoundToZero = 0x0B
+};
+
 // Test whether the given address will fit in an address immediate field.
 // This is always true on x86, but on x64 it's only true for addresses which
 // fit in the 32-bit immediate field.
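(Aside for review, not part of the patch.) Per the Intel SDM, the ROUNDPS/ROUNDPD immediate byte uses bits 1:0 for the rounding mode, bit 2 = 0 to take that mode from the immediate rather than from MXCSR.RC, and bit 3 = 1 to suppress the precision (inexact) exception. Each SSERoundingMode value above is therefore the corresponding RoundingMode with 0x08 OR'ed in, which a compile-time check could document (sketch only, under those assumptions):

    // Sketch: SSERoundingMode == RoundingMode | 0x08 (inexact suppressed).
    static_assert(int(SSERoundingMode::RoundToNearest) == (RoundToNearest | 0x08));
    static_assert(int(SSERoundingMode::RoundDown) == (RoundDown | 0x08));
    static_assert(int(SSERoundingMode::RoundUp) == (RoundUp | 0x08));
    static_assert(int(SSERoundingMode::RoundToZero) == (RoundToZero | 0x08));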
diff --git a/js/src/jit/x86-shared/Encoding-x86-shared.h b/js/src/jit/x86-shared/Encoding-x86-shared.h index 25d499b7811d0..185665096aaf7 100644 --- a/js/src/jit/x86-shared/Encoding-x86-shared.h +++ b/js/src/jit/x86-shared/Encoding-x86-shared.h @@ -315,6 +315,7 @@ enum TwoByteOpcodeID { OP2_PSLLD_VdqWdq = 0xF2, OP2_PSLLQ_VdqWdq = 0xF3, OP2_PMULUDQ_VdqWdq = 0xF4, + OP2_PMADDWD_VdqWdq = 0xF5, OP2_PSUBB_VdqWdq = 0xF8, OP2_PSUBW_VdqWdq = 0xF9, OP2_PSUBD_VdqWdq = 0xFA, @@ -326,6 +327,8 @@ enum TwoByteOpcodeID { enum ThreeByteOpcodeID { OP3_PSHUFB_VdqWdq = 0x00, + OP3_ROUNDPS_VpsWps = 0x08, + OP3_ROUNDPD_VpdWpd = 0x09, OP3_ROUNDSS_VsdWsd = 0x0A, OP3_ROUNDSD_VsdWsd = 0x0B, OP3_BLENDPS_VpsWpsIb = 0x0C, diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h index cfbd00c5c45e5..64a07d9b4d936 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h @@ -1969,7 +1969,7 @@ void MacroAssembler::minFloat64x2(FloatRegister rhs, FloatRegister lhsDest) { MacroAssemblerX86Shared::minFloat64x2(lhsDest, Operand(rhs), lhsDest); } -// NaN-propagating maxium +// NaN-propagating maximum void MacroAssembler::maxFloat32x4(FloatRegister rhs, FloatRegister lhsDest, FloatRegister temp) { @@ -1981,6 +1981,68 @@ void MacroAssembler::maxFloat64x2(FloatRegister rhs, FloatRegister lhsDest, MacroAssemblerX86Shared::maxFloat64x2(lhsDest, Operand(rhs), temp, lhsDest); } +// Compare-based minimum + +void MacroAssembler::pseudoMinFloat32x4(FloatRegister rhs, + FloatRegister lhsDest) { + vminps(Operand(rhs), lhsDest, lhsDest); +} + +void MacroAssembler::pseudoMinFloat64x2(FloatRegister rhs, + FloatRegister lhsDest) { + vminpd(Operand(rhs), lhsDest, lhsDest); +} + +// Compare-based maximum + +void MacroAssembler::pseudoMaxFloat32x4(FloatRegister rhs, + FloatRegister lhsDest) { + vmaxps(Operand(rhs), lhsDest, lhsDest); +} + +void MacroAssembler::pseudoMaxFloat64x2(FloatRegister rhs, + FloatRegister lhsDest) { + vmaxpd(Operand(rhs), lhsDest, lhsDest); +} + +void MacroAssembler::widenDotInt16x8(FloatRegister rhs, FloatRegister lhsDest) { + vpmaddwd(Operand(rhs), lhsDest, lhsDest); +} + +// Rounding + +void MacroAssembler::ceilFloat32x4(FloatRegister src, FloatRegister dest) { + vroundps(Assembler::SSERoundingMode::Ceil, Operand(src), dest); +} + +void MacroAssembler::ceilFloat64x2(FloatRegister src, FloatRegister dest) { + vroundpd(Assembler::SSERoundingMode::Ceil, Operand(src), dest); +} + +void MacroAssembler::floorFloat32x4(FloatRegister src, FloatRegister dest) { + vroundps(Assembler::SSERoundingMode::Floor, Operand(src), dest); +} + +void MacroAssembler::floorFloat64x2(FloatRegister src, FloatRegister dest) { + vroundpd(Assembler::SSERoundingMode::Floor, Operand(src), dest); +} + +void MacroAssembler::truncFloat32x4(FloatRegister src, FloatRegister dest) { + vroundps(Assembler::SSERoundingMode::Trunc, Operand(src), dest); +} + +void MacroAssembler::truncFloat64x2(FloatRegister src, FloatRegister dest) { + vroundpd(Assembler::SSERoundingMode::Trunc, Operand(src), dest); +} + +void MacroAssembler::nearestFloat32x4(FloatRegister src, FloatRegister dest) { + vroundps(Assembler::SSERoundingMode::Nearest, Operand(src), dest); +} + +void MacroAssembler::nearestFloat64x2(FloatRegister src, FloatRegister dest) { + vroundpd(Assembler::SSERoundingMode::Nearest, Operand(src), dest); +} + // Floating add void MacroAssembler::addFloat32x4(FloatRegister rhs, FloatRegister lhsDest) { diff --git 
a/js/src/jit/x86-shared/MacroAssembler-x86-shared.cpp b/js/src/jit/x86-shared/MacroAssembler-x86-shared.cpp index f992ee4533107..ca0501e1a410b 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.cpp +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.cpp @@ -1565,7 +1565,7 @@ void MacroAssembler::floorFloat32ToInt32(FloatRegister src, Register dest, // Round toward -Infinity. { ScratchFloat32Scope scratch(*this); - vroundss(X86Encoding::RoundDown, src, scratch, scratch); + vroundss(X86Encoding::RoundDown, src, scratch); truncateFloat32ToInt32(scratch, dest, fail); } } else { @@ -1620,7 +1620,7 @@ void MacroAssembler::floorDoubleToInt32(FloatRegister src, Register dest, // Round toward -Infinity. { ScratchDoubleScope scratch(*this); - vroundsd(X86Encoding::RoundDown, src, scratch, scratch); + vroundsd(X86Encoding::RoundDown, src, scratch); truncateDoubleToInt32(scratch, dest, fail); } } else { @@ -1684,7 +1684,7 @@ void MacroAssembler::ceilFloat32ToInt32(FloatRegister src, Register dest, // x <= -1 or x > -0 bind(&lessThanOrEqualMinusOne); // Round toward +Infinity. - vroundss(X86Encoding::RoundUp, src, scratch, scratch); + vroundss(X86Encoding::RoundUp, src, scratch); truncateFloat32ToInt32(scratch, dest, fail); return; } @@ -1729,7 +1729,7 @@ void MacroAssembler::ceilDoubleToInt32(FloatRegister src, Register dest, // x <= -1 or x > -0 bind(&lessThanOrEqualMinusOne); // Round toward +Infinity. - vroundsd(X86Encoding::RoundUp, src, scratch, scratch); + vroundsd(X86Encoding::RoundUp, src, scratch); truncateDoubleToInt32(scratch, dest, fail); return; } @@ -1848,7 +1848,7 @@ void MacroAssembler::roundFloat32ToInt32(FloatRegister src, Register dest, // Add 0.5 and round toward -Infinity. The result is stored in the temp // register (currently contains 0.5). addFloat32(src, temp); - vroundss(X86Encoding::RoundDown, temp, scratch, scratch); + vroundss(X86Encoding::RoundDown, temp, scratch); // Truncate. truncateFloat32ToInt32(scratch, dest, fail); @@ -1934,7 +1934,7 @@ void MacroAssembler::roundDoubleToInt32(FloatRegister src, Register dest, // Add 0.5 and round toward -Infinity. The result is stored in the temp // register (currently contains 0.5). addDouble(src, temp); - vroundsd(X86Encoding::RoundDown, temp, scratch, scratch); + vroundsd(X86Encoding::RoundDown, temp, scratch); // Truncate. 
  truncateDoubleToInt32(scratch, dest, fail);
diff --git a/js/src/jit/x86/MacroAssembler-x86.cpp b/js/src/jit/x86/MacroAssembler-x86.cpp
index 6d7de08999db8..bae89fac08f15 100644
--- a/js/src/jit/x86/MacroAssembler-x86.cpp
+++ b/js/src/jit/x86/MacroAssembler-x86.cpp
@@ -614,9 +614,11 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
       movl(srcAddr, out.gpr());
       break;
     case Scalar::Float32:
+      // vmovss does the right thing also for access.isZeroExtendSimd128Load()
       vmovss(srcAddr, out.fpu());
       break;
     case Scalar::Float64:
+      // vmovsd does the right thing also for access.isZeroExtendSimd128Load()
       vmovsd(srcAddr, out.fpu());
       break;
     case Scalar::Simd128:
diff --git a/js/src/wasm/WasmBaselineCompile.cpp b/js/src/wasm/WasmBaselineCompile.cpp
index be4cf614a4652..0207f7d461fd8 100644
--- a/js/src/wasm/WasmBaselineCompile.cpp
+++ b/js/src/wasm/WasmBaselineCompile.cpp
@@ -6380,7 +6380,7 @@ class BaseCompiler final : public BaseCompilerInterface {

  void roundF32(RoundingMode roundingMode, RegF32 f0) {
 #if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86)
-    masm.vroundss(Assembler::ToX86RoundingMode(roundingMode), f0, f0, f0);
+    masm.vroundss(Assembler::ToX86RoundingMode(roundingMode), f0, f0);
 #else
     MOZ_CRASH("NYI");
 #endif
@@ -6388,7 +6388,7 @@ class BaseCompiler final : public BaseCompilerInterface {

  void roundF64(RoundingMode roundingMode, RegF64 f0) {
 #if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86)
-    masm.vroundsd(Assembler::ToX86RoundingMode(roundingMode), f0, f0, f0);
+    masm.vroundsd(Assembler::ToX86RoundingMode(roundingMode), f0, f0);
 #else
     MOZ_CRASH("NYI");
 #endif
@@ -8140,6 +8140,7 @@ class BaseCompiler final : public BaseCompilerInterface {
   void emitVectorAndNot();

   MOZ_MUST_USE bool emitLoadSplat(Scalar::Type viewType);
+  MOZ_MUST_USE bool emitLoadZero(Scalar::Type viewType);
   MOZ_MUST_USE bool emitLoadExtend(Scalar::Type viewType);
   MOZ_MUST_USE bool emitBitselect();
   MOZ_MUST_USE bool emitVectorShuffle();
@@ -12776,6 +12777,26 @@ static void MaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
   masm.maxFloat64x2(rs, rsd, temp);
 }

+static void PMinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+  masm.pseudoMinFloat32x4(rs, rsd);
+}
+
+static void PMinF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+  masm.pseudoMinFloat64x2(rs, rsd);
+}
+
+static void PMaxF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+  masm.pseudoMaxFloat32x4(rs, rsd);
+}
+
+static void PMaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+  masm.pseudoMaxFloat64x2(rs, rsd);
+}
+
+static void DotI16x8(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
+  masm.widenDotInt16x8(rs, rsd);
+}
+
 static void CmpI8x16(MacroAssembler& masm, Assembler::Condition cond,
                      RegV128 rs, RegV128 rsd) {
   masm.compareInt8x16(cond, rs, rsd);
@@ -12856,6 +12877,38 @@ static void SqrtF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
   masm.sqrtFloat64x2(rs, rd);
 }

+static void CeilF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
+  masm.ceilFloat32x4(rs, rd);
+}
+
+static void FloorF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
+  masm.floorFloat32x4(rs, rd);
+}
+
+static void TruncF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
+  masm.truncFloat32x4(rs, rd);
+}
+
+static void NearestF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
+  masm.nearestFloat32x4(rs, rd);
+}
+
+static void CeilF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
+  masm.ceilFloat64x2(rs, rd);
+}
+
+static void FloorF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
+  masm.floorFloat64x2(rs, rd);
+}
+
+static void TruncF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
+  masm.truncFloat64x2(rs, rd);
+}
+
+static void NearestF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
+  masm.nearestFloat64x2(rs, rd);
+}
+
 static void NotV128(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
   masm.bitwiseNotSimd128(rs, rd);
 }
@@ -13355,6 +13408,22 @@ bool BaseCompiler::emitLoadSplat(Scalar::Type viewType) {
   return true;
 }

+bool BaseCompiler::emitLoadZero(Scalar::Type viewType) {
+  // LoadZero has the structure of LoadSplat
+  LinearMemoryAddress<Nothing> addr;
+  if (!iter_.readLoadSplat(Scalar::byteSize(viewType), &addr)) {
+    return false;
+  }
+
+  if (deadCode_) {
+    return true;
+  }
+
+  MemoryAccessDesc access(viewType, addr.align, addr.offset, bytecodeOffset());
+  access.setZeroExtendSimd128Load();
+  return loadCommon(&access, AccessCheck(), ValType::V128);
+}
+
 bool BaseCompiler::emitLoadExtend(Scalar::Type viewType) {
   LinearMemoryAddress<Nothing> addr;
   if (!iter_.readLoadExtend(&addr)) {
@@ -13580,6 +13649,11 @@ bool BaseCompiler::emitBody() {
   } while (0)
 #endif

+#define CHECK_EXPERIMENTAL_SIMD() \
+  if (!SimdExperimentalEnabled) { \
+    break;                        \
+  }
+
 #define CHECK(E) \
   if (!(E)) return false
 #define NEXT() \
@@ -14534,6 +14608,21 @@ bool BaseCompiler::emitBody() {
        CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, NarrowUI32x4));
      case uint32_t(SimdOp::V8x16Swizzle):
        CHECK_NEXT(dispatchVectorBinary(emitVectorBinopWithTemp, Swizzle));
+      case uint32_t(SimdOp::F32x4PMaxExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, PMaxF32x4));
+      case uint32_t(SimdOp::F32x4PMinExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, PMinF32x4));
+      case uint32_t(SimdOp::F64x2PMaxExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, PMaxF64x2));
+      case uint32_t(SimdOp::F64x2PMinExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, PMinF64x2));
+      case uint32_t(SimdOp::I32x4DotSI16x8Experimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, DotI16x8));
      case uint32_t(SimdOp::I8x16Neg):
        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, NegI8x16));
      case uint32_t(SimdOp::I16x8Neg):
@@ -14590,6 +14679,30 @@ bool BaseCompiler::emitBody() {
        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, AbsI16x8));
      case uint32_t(SimdOp::I32x4Abs):
        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, AbsI32x4));
+      case uint32_t(SimdOp::F32x4CeilExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, CeilF32x4));
+      case uint32_t(SimdOp::F32x4FloorExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, FloorF32x4));
+      case uint32_t(SimdOp::F32x4TruncExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, TruncF32x4));
+      case uint32_t(SimdOp::F32x4NearestExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, NearestF32x4));
+      case uint32_t(SimdOp::F64x2CeilExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, CeilF64x2));
+      case uint32_t(SimdOp::F64x2FloorExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, FloorF64x2));
+      case uint32_t(SimdOp::F64x2TruncExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, TruncF64x2));
+      case uint32_t(SimdOp::F64x2NearestExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(dispatchVectorUnary(emitVectorUnop, NearestF64x2));
      case uint32_t(SimdOp::I8x16Shl):
        CHECK_NEXT(dispatchVectorVariableShiftWithTwoTemps(ShiftLeftI8x16));
      case uint32_t(SimdOp::I8x16ShrS):
@@ -14650,6 +14763,12 @@ bool BaseCompiler::emitBody() {
        CHECK_NEXT(emitLoadExtend(Scalar::Int32));
      case uint32_t(SimdOp::I64x2LoadU32x2):
        CHECK_NEXT(emitLoadExtend(Scalar::Uint32));
+      case uint32_t(SimdOp::V128Load32ZeroExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(emitLoadZero(Scalar::Float32));
+      case uint32_t(SimdOp::V128Load64ZeroExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK_NEXT(emitLoadZero(Scalar::Float64));
      case uint32_t(SimdOp::V128Store):
        CHECK_NEXT(emitStore(ValType::V128, Scalar::Simd128));
      default:
@@ -14954,6 +15073,7 @@ bool BaseCompiler::emitBody() {
 #undef NEXT
 #undef CHECK_NEXT
 #undef CHECK_POINTER_COUNT
+#undef CHECK_EXPERIMENTAL_SIMD
 #undef dispatchBinary
 #undef dispatchUnary
 #undef dispatchComparison
diff --git a/js/src/wasm/WasmConstants.h b/js/src/wasm/WasmConstants.h
index 7bedc84cf20cf..dcb69c9c91723 100644
--- a/js/src/wasm/WasmConstants.h
+++ b/js/src/wasm/WasmConstants.h
@@ -404,6 +404,16 @@ enum class GcOp {
 };

 // Opcode list from the SIMD proposal post-renumbering in May, 2020.
+
+// Opcodes with suffix 'Experimental' are proposed but not standardized, and
+// are compatible with those same opcodes in V8. No opcode labeled
+// 'Experimental' will ship in a Release build where SIMD is enabled by
+// default.
+//
+// Once SIMD ships default-on in release builds, the following flag must be
+// set to false for RELEASE_OR_BETA.
+
+static constexpr bool SimdExperimentalEnabled = true;
+
 enum class SimdOp {
   V128Load = 0x00,
   I16x8LoadS8x8 = 0x01,
@@ -591,7 +601,7 @@ enum class SimdOp {
   I32x4MinU = 0xb7,
   I32x4MaxS = 0xb8,
   I32x4MaxU = 0xb9,
-  // AvgrS = 0xba
+  I32x4DotSI16x8Experimental = 0xba,
   // AvgrU = 0xbb
   // Unused = 0xbc
   // Unused = 0xbd
@@ -621,14 +631,14 @@ enum class SimdOp {
   I64x2Mul = 0xd5,
   // MinS = 0xd6
   // MinU = 0xd7
-  // MaxS = 0xd8
-  // MaxU = 0xd9
-  // AvgrS = 0xda
-  // AvgrU = 0xdb
-  // Unused = 0xdc
-  // Unused = 0xdd
-  // Unused = 0xde
-  // Unused = 0xdf
+  F32x4CeilExperimental = 0xd8,
+  F32x4FloorExperimental = 0xd9,
+  F32x4TruncExperimental = 0xda,
+  F32x4NearestExperimental = 0xdb,
+  F64x2CeilExperimental = 0xdc,
+  F64x2FloorExperimental = 0xdd,
+  F64x2TruncExperimental = 0xde,
+  F64x2NearestExperimental = 0xdf,
   F32x4Abs = 0xe0,
   F32x4Neg = 0xe1,
   // Round = 0xe2
@@ -639,8 +649,8 @@ enum class SimdOp {
   F32x4Div = 0xe7,
   F32x4Min = 0xe8,
   F32x4Max = 0xe9,
-  // PMin = 0xea
-  // PMax = 0xeb
+  F32x4PMinExperimental = 0xea,
+  F32x4PMaxExperimental = 0xeb,
   F64x2Abs = 0xec,
   F64x2Neg = 0xed,
   // Round = 0xee
@@ -651,13 +661,15 @@ enum class SimdOp {
   F64x2Div = 0xf3,
   F64x2Min = 0xf4,
   F64x2Max = 0xf5,
-  // PMin = 0xf6
-  // PMax = 0xf7
+  F64x2PMinExperimental = 0xf6,
+  F64x2PMaxExperimental = 0xf7,
   I32x4TruncSSatF32x4 = 0xf8,
   I32x4TruncUSatF32x4 = 0xf9,
   F32x4ConvertSI32x4 = 0xfa,
   F32x4ConvertUI32x4 = 0xfb,
-  // Unused = 0xfc and up
+  V128Load32ZeroExperimental = 0xfc,
+  V128Load64ZeroExperimental = 0xfd,
+  // Unused = 0xfe and up
   Limit
 };
diff --git a/js/src/wasm/WasmIonCompile.cpp b/js/src/wasm/WasmIonCompile.cpp
index 9bd5508c81532..48433e60a4e1e 100644
--- a/js/src/wasm/WasmIonCompile.cpp
+++ b/js/src/wasm/WasmIonCompile.cpp
@@ -809,6 +809,18 @@ class FunctionCompiler {
     }
     return scalarToSimd128(scalar, op);
   }
+
+  MDefinition* loadZeroSimd128(Scalar::Type viewType, size_t numBytes,
+                               const LinearMemoryAddress<MDefinition*>& addr) {
+    if (inDeadCode()) {
+      return nullptr;
+    }
+
+    MemoryAccessDesc access(viewType, addr.align, addr.offset,
+                            bytecodeIfNotAsmJS());
+    access.setZeroExtendSimd128Load();
+    return load(addr.base, &access, ValType::V128);
+  }
 #endif  // ENABLE_WASM_SIMD

 private:
@@ -4205,6 +4217,16 @@ static bool EmitLoadExtendSimd128(FunctionCompiler& f, wasm::SimdOp op) {
   return true;
 }

+static bool EmitLoadZeroSimd128(FunctionCompiler& f, Scalar::Type viewType,
+                                size_t numBytes) {
+  LinearMemoryAddress<MDefinition*> addr;
+  if (!f.iter().readLoadSplat(numBytes, &addr)) {
+    return false;
+  }
+
+  f.iter().setResult(f.loadZeroSimd128(viewType, numBytes, addr));
+  return true;
+}
 #endif

 static bool EmitBodyExprs(FunctionCompiler& f) {
@@ -4216,6 +4238,11 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
     if (!(c)) return false; \
     break

+#define CHECK_EXPERIMENTAL_SIMD() \
+  if (!SimdExperimentalEnabled) { \
+    return f.iter().unrecognizedOpcode(&op); \
+  }
+
   while (true) {
     if (!f.mirGen().ensureBallast()) {
       return false;
     }
@@ -4747,6 +4774,9 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
      case uint32_t(SimdOp::F64x2Eq):
      case uint32_t(SimdOp::F64x2Ne):
        CHECK(EmitBinarySimd128(f, /* commutative= */ true, SimdOp(op.b1)));
+      case uint32_t(SimdOp::I32x4DotSI16x8Experimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK(EmitBinarySimd128(f, /* commutative= */ true, SimdOp(op.b1)));
      case uint32_t(SimdOp::V128AndNot):
      case uint32_t(SimdOp::I8x16Sub):
      case uint32_t(SimdOp::I8x16SubSaturateS):
@@ -4799,6 +4829,13 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
      case uint32_t(SimdOp::V8x16Swizzle):
        CHECK(
            EmitBinarySimd128(f, /* commutative= */ false, SimdOp(op.b1)));
+      case uint32_t(SimdOp::F32x4PMaxExperimental):
+      case uint32_t(SimdOp::F32x4PMinExperimental):
+      case uint32_t(SimdOp::F64x2PMaxExperimental):
+      case uint32_t(SimdOp::F64x2PMinExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK(
+            EmitBinarySimd128(f, /* commutative= */ false, SimdOp(op.b1)));
      case uint32_t(SimdOp::I8x16Splat):
      case uint32_t(SimdOp::I16x8Splat):
      case uint32_t(SimdOp::I32x4Splat):
@@ -4836,6 +4873,16 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
      case uint32_t(SimdOp::I16x8Abs):
      case uint32_t(SimdOp::I32x4Abs):
        CHECK(EmitUnarySimd128(f, SimdOp(op.b1)));
+      case uint32_t(SimdOp::F32x4CeilExperimental):
+      case uint32_t(SimdOp::F32x4FloorExperimental):
+      case uint32_t(SimdOp::F32x4TruncExperimental):
+      case uint32_t(SimdOp::F32x4NearestExperimental):
+      case uint32_t(SimdOp::F64x2CeilExperimental):
+      case uint32_t(SimdOp::F64x2FloorExperimental):
+      case uint32_t(SimdOp::F64x2TruncExperimental):
+      case uint32_t(SimdOp::F64x2NearestExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK(EmitUnarySimd128(f, SimdOp(op.b1)));
      case uint32_t(SimdOp::I8x16AnyTrue):
      case uint32_t(SimdOp::I16x8AnyTrue):
      case uint32_t(SimdOp::I32x4AnyTrue):
@@ -4904,6 +4951,12 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
      case uint32_t(SimdOp::I64x2LoadS32x2):
      case uint32_t(SimdOp::I64x2LoadU32x2):
        CHECK(EmitLoadExtendSimd128(f, SimdOp(op.b1)));
+      case uint32_t(SimdOp::V128Load32ZeroExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK(EmitLoadZeroSimd128(f, Scalar::Float32, 4));
+      case uint32_t(SimdOp::V128Load64ZeroExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK(EmitLoadZeroSimd128(f, Scalar::Float64, 8));
      default:
        return f.iter().unrecognizedOpcode(&op);
    }  // switch (op.b1)
@@ -5232,6 +5285,7 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
   MOZ_CRASH("unreachable");

 #undef CHECK
+#undef CHECK_EXPERIMENTAL_SIMD
 }

 bool wasm::IonCompileFunctions(const ModuleEnvironment& env, LifoAlloc& lifo,
diff --git a/js/src/wasm/WasmOpIter.cpp b/js/src/wasm/WasmOpIter.cpp
index 69ccd4984be40..a41747eecf232 100644
--- a/js/src/wasm/WasmOpIter.cpp
+++ b/js/src/wasm/WasmOpIter.cpp
@@ -421,6 +421,11 @@ OpKind wasm::Classify(OpBytes op) {
     case SimdOp::I16x8NarrowSI32x4:
     case SimdOp::I16x8NarrowUI32x4:
     case SimdOp::V8x16Swizzle:
+    case SimdOp::F32x4PMinExperimental:
+    case SimdOp::F32x4PMaxExperimental:
+    case SimdOp::F64x2PMinExperimental:
+    case SimdOp::F64x2PMaxExperimental:
+    case SimdOp::I32x4DotSI16x8Experimental:
       WASM_SIMD_OP(OpKind::Binary);
     case SimdOp::I8x16Neg:
     case SimdOp::I16x8Neg:
@@ -448,6 +453,14 @@ OpKind wasm::Classify(OpBytes op) {
     case SimdOp::I8x16Abs:
     case SimdOp::I16x8Abs:
     case SimdOp::I32x4Abs:
+    case SimdOp::F32x4CeilExperimental:
+    case SimdOp::F32x4FloorExperimental:
+    case SimdOp::F32x4TruncExperimental:
+    case SimdOp::F32x4NearestExperimental:
+    case SimdOp::F64x2CeilExperimental:
+    case SimdOp::F64x2FloorExperimental:
+    case SimdOp::F64x2TruncExperimental:
+    case SimdOp::F64x2NearestExperimental:
       WASM_SIMD_OP(OpKind::Unary);
     case SimdOp::I8x16Shl:
     case SimdOp::I8x16ShrS:
@@ -479,6 +492,8 @@ OpKind wasm::Classify(OpBytes op) {
     case SimdOp::I32x4LoadU16x4:
     case SimdOp::I64x2LoadS32x2:
     case SimdOp::I64x2LoadU32x2:
+    case SimdOp::V128Load32ZeroExperimental:
+    case SimdOp::V128Load64ZeroExperimental:
       WASM_SIMD_OP(OpKind::Load);
     case SimdOp::V128Store:
       WASM_SIMD_OP(OpKind::Store);
diff --git a/js/src/wasm/WasmValidate.cpp b/js/src/wasm/WasmValidate.cpp
index ac91aa86b28ee..80a96fe86b70f 100644
--- a/js/src/wasm/WasmValidate.cpp
+++ b/js/src/wasm/WasmValidate.cpp
@@ -478,6 +478,11 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
     if (!(c)) return false; \
     break

+#define CHECK_EXPERIMENTAL_SIMD() \
+  if (!SimdExperimentalEnabled) { \
+    return iter.unrecognizedOpcode(&op); \
+  }
+
   while (true) {
     OpBytes op;
     if (!iter.readOp(&op)) {
@@ -1043,6 +1048,14 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
       case uint32_t(SimdOp::V8x16Swizzle):
         CHECK(iter.readBinary(ValType::V128, &nothing, &nothing));

+      case uint32_t(SimdOp::F32x4PMaxExperimental):
+      case uint32_t(SimdOp::F32x4PMinExperimental):
+      case uint32_t(SimdOp::F64x2PMaxExperimental):
+      case uint32_t(SimdOp::F64x2PMinExperimental):
+      case uint32_t(SimdOp::I32x4DotSI16x8Experimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK(iter.readBinary(ValType::V128, &nothing, &nothing));
+
       case uint32_t(SimdOp::I8x16Neg):
       case uint32_t(SimdOp::I16x8Neg):
       case uint32_t(SimdOp::I16x8WidenLowSI8x16):
@@ -1071,6 +1084,17 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
       case uint32_t(SimdOp::I32x4Abs):
         CHECK(iter.readUnary(ValType::V128, &nothing));

+      case uint32_t(SimdOp::F32x4CeilExperimental):
+      case uint32_t(SimdOp::F32x4FloorExperimental):
+      case uint32_t(SimdOp::F32x4TruncExperimental):
+      case uint32_t(SimdOp::F32x4NearestExperimental):
+      case uint32_t(SimdOp::F64x2CeilExperimental):
+      case uint32_t(SimdOp::F64x2FloorExperimental):
+      case uint32_t(SimdOp::F64x2TruncExperimental):
+      case uint32_t(SimdOp::F64x2NearestExperimental):
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK(iter.readUnary(ValType::V128, &nothing));
+
       case uint32_t(SimdOp::I8x16Shl):
       case uint32_t(SimdOp::I8x16ShrS):
       case uint32_t(SimdOp::I8x16ShrU):
@@ -1146,6 +1170,18 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
         CHECK(iter.readStore(ValType::V128, 16, &addr, &nothing));
       }

+      case uint32_t(SimdOp::V128Load32ZeroExperimental): {
+        LinearMemoryAddress<Nothing> addr;
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK(iter.readLoadSplat(4, &addr));
+      }
+
+      case uint32_t(SimdOp::V128Load64ZeroExperimental): {
+        LinearMemoryAddress<Nothing> addr;
+        CHECK_EXPERIMENTAL_SIMD();
+        CHECK(iter.readLoadSplat(8, &addr));
+      }
+
       default:
         return iter.unrecognizedOpcode(&op);
     }

@@ -1450,6 +1486,7 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
   MOZ_CRASH("unreachable");

 #undef CHECK
+#undef CHECK_EXPERIMENTAL_SIMD
 }

 bool wasm::ValidateFunctionBody(const ModuleEnvironment& env,
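(Aside for review, not part of the patch.) The hunks above validate v128.load32_zero and v128.load64_zero with the same immediate structure as the splat loads, hence the reuse of readLoadSplat. Their semantics: read 32 or 64 bits into the low lane and zero the remaining lanes, which is why the x86/x64 load paths can reuse plain vmovss/vmovsd, whose memory forms already zero the upper bits of the destination register. A reference sketch of the semantics, with hypothetical names:

    #include <cstdint>
    #include <cstring>

    struct V128 { uint8_t bytes[16]; };

    // v128.load32_zero: lane 0 = 32 bits from memory, other lanes = 0.
    V128 Load32Zero(const uint8_t* mem) {
      V128 r{};  // zero-initialize all 16 bytes
      std::memcpy(r.bytes, mem, 4);
      return r;
    }

    // v128.load64_zero: lane 0 = 64 bits from memory, high lane = 0.
    V128 Load64Zero(const uint8_t* mem) {
      V128 r{};
      std::memcpy(r.bytes, mem, 8);
      return r;
    }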