From 31e944556f5464b5a488c42f1d727d5b27734169 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 28 Oct 2020 09:38:59 -0700
Subject: [PATCH] [WebAssembly] Prototype extending multiplication SIMD
 instructions

As proposed in https://github.com/WebAssembly/simd/pull/376. This commit
implements new builtin functions and intrinsics for these instructions, but does
not yet add them to wasm_simd128.h because they have not yet been merged to the
proposal. These are the first instructions with opcodes greater than 0xff, so
this commit updates the MC layer and disassembler to handle that correctly.

Differential Revision: https://reviews.llvm.org/D90253
---
 .../clang/Basic/BuiltinsWebAssembly.def       |  15 ++
 clang/lib/CodeGen/CGBuiltin.cpp               |  43 ++++++
 clang/test/CodeGen/builtins-wasm.c            |  84 ++++++++++
 llvm/include/llvm/IR/IntrinsicsWebAssembly.td |  17 ++
 .../MCTargetDesc/WebAssemblyMCCodeEmitter.cpp |  10 +-
 .../WebAssembly/WebAssemblyInstrSIMD.td       |  55 ++++++-
 .../CodeGen/WebAssembly/simd-intrinsics.ll    | 145 ++++++++++++++++++
 llvm/test/MC/WebAssembly/simd-encodings.s     |  36 +++++
 .../WebAssemblyDisassemblerEmitter.cpp        |  12 +-
 9 files changed, 410 insertions(+), 7 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index f84ce92ffe7e..1ee978101095 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -118,6 +118,21 @@ TARGET_BUILTIN(__builtin_wasm_popcnt_i8x16, "V16ScV16Sc", "nc", "simd128")
 
 TARGET_BUILTIN(__builtin_wasm_q15mulr_saturate_s_i8x16, "V8sV8sV8s", "nc", "simd128")
 
+TARGET_BUILTIN(__builtin_wasm_extmul_low_i8x16_s_i16x8, "V8sV16ScV16Sc", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_extmul_high_i8x16_s_i16x8, "V8sV16ScV16Sc", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_extmul_low_i8x16_u_i16x8, "V8UsV16UcV16Uc", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_extmul_high_i8x16_u_i16x8, "V8UsV16UcV16Uc", "nc", "simd128")
+
+TARGET_BUILTIN(__builtin_wasm_extmul_low_i16x8_s_i32x4, "V4iV8sV8s", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_extmul_high_i16x8_s_i32x4, "V4iV8sV8s", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_extmul_low_i16x8_u_i32x4, "V4UiV8UsV8Us", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_extmul_high_i16x8_u_i32x4, "V4UiV8UsV8Us", "nc", "simd128")
+
+TARGET_BUILTIN(__builtin_wasm_extmul_low_i32x4_s_i64x2, "V2LLiV4iV4i", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_extmul_high_i32x4_s_i64x2, "V2LLiV4iV4i", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_extmul_low_i32x4_u_i64x2, "V2ULLiV4UiV4Ui", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_extmul_high_i32x4_u_i64x2, "V2ULLiV4UiV4Ui", "nc", "simd128")
+
 TARGET_BUILTIN(__builtin_wasm_bitselect, "V4iV4iV4iV4i", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_shuffle_v8x16, "V16ScV16ScV16ScIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIi", "nc", "simd128")
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ae3645df183d..e38964356542 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16600,6 +16600,49 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::wasm_q15mulr_saturate_signed);
     return Builder.CreateCall(Callee, {LHS, RHS});
   }
+  case WebAssembly::BI__builtin_wasm_extmul_low_i8x16_s_i16x8:
+  case WebAssembly::BI__builtin_wasm_extmul_high_i8x16_s_i16x8:
+  case WebAssembly::BI__builtin_wasm_extmul_low_i8x16_u_i16x8:
+  case WebAssembly::BI__builtin_wasm_extmul_high_i8x16_u_i16x8:
+  case WebAssembly::BI__builtin_wasm_extmul_low_i16x8_s_i32x4:
+  case WebAssembly::BI__builtin_wasm_extmul_high_i16x8_s_i32x4:
+  case WebAssembly::BI__builtin_wasm_extmul_low_i16x8_u_i32x4:
+  case WebAssembly::BI__builtin_wasm_extmul_high_i16x8_u_i32x4:
+  case WebAssembly::BI__builtin_wasm_extmul_low_i32x4_s_i64x2:
+  case WebAssembly::BI__builtin_wasm_extmul_high_i32x4_s_i64x2:
+  case WebAssembly::BI__builtin_wasm_extmul_low_i32x4_u_i64x2:
+  case WebAssembly::BI__builtin_wasm_extmul_high_i32x4_u_i64x2: {
+    Value *LHS = EmitScalarExpr(E->getArg(0));
+    Value *RHS = EmitScalarExpr(E->getArg(1));
+    unsigned IntNo;
+    switch (BuiltinID) {
+    case WebAssembly::BI__builtin_wasm_extmul_low_i8x16_s_i16x8:
+    case WebAssembly::BI__builtin_wasm_extmul_low_i16x8_s_i32x4:
+    case WebAssembly::BI__builtin_wasm_extmul_low_i32x4_s_i64x2:
+      IntNo = Intrinsic::wasm_extmul_low_signed;
+      break;
+    case WebAssembly::BI__builtin_wasm_extmul_low_i8x16_u_i16x8:
+    case WebAssembly::BI__builtin_wasm_extmul_low_i16x8_u_i32x4:
+    case WebAssembly::BI__builtin_wasm_extmul_low_i32x4_u_i64x2:
+      IntNo = Intrinsic::wasm_extmul_low_unsigned;
+      break;
+    case WebAssembly::BI__builtin_wasm_extmul_high_i8x16_s_i16x8:
+    case WebAssembly::BI__builtin_wasm_extmul_high_i16x8_s_i32x4:
+    case WebAssembly::BI__builtin_wasm_extmul_high_i32x4_s_i64x2:
+      IntNo = Intrinsic::wasm_extmul_high_signed;
+      break;
+    case WebAssembly::BI__builtin_wasm_extmul_high_i8x16_u_i16x8:
+    case WebAssembly::BI__builtin_wasm_extmul_high_i16x8_u_i32x4:
+    case WebAssembly::BI__builtin_wasm_extmul_high_i32x4_u_i64x2:
+      IntNo = Intrinsic::wasm_extmul_high_unsigned;
+      break;
+    default:
+      llvm_unreachable("unexptected builtin ID");
+    }
+
+    Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
+    return Builder.CreateCall(Callee, {LHS, RHS});
+  }
   case WebAssembly::BI__builtin_wasm_bitselect: {
     Value *V1 = EmitScalarExpr(E->getArg(0));
     Value *V2 = EmitScalarExpr(E->getArg(1));
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index 4b882700e3ac..be61e9e4c86e 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -525,6 +525,90 @@ i16x8 q15mulr_saturate_s_i16x8(i16x8 x, i16x8 y) {
   // WEBASSEMBLY-NEXT: ret
 }
 
+i16x8 extmul_low_i8x16_s_i16x8(i8x16 x, i8x16 y) {
+  return __builtin_wasm_extmul_low_i8x16_s_i16x8(x, y);
+  // WEBASSEMBLY: call <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(
+  // WEBASSEMBLY-SAME: <16 x i8> %x, <16 x i8> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+i16x8 extmul_high_i8x16_s_i16x8(i8x16 x, i8x16 y) {
+  return __builtin_wasm_extmul_high_i8x16_s_i16x8(x, y);
+  // WEBASSEMBLY: call <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(
+  // WEBASSEMBLY-SAME: <16 x i8> %x, <16 x i8> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+u16x8 extmul_low_i8x16_u_i16x8(u8x16 x, u8x16 y) {
+  return __builtin_wasm_extmul_low_i8x16_u_i16x8(x, y);
+  // WEBASSEMBLY: call <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(
+  // WEBASSEMBLY-SAME: <16 x i8> %x, <16 x i8> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+u16x8 extmul_high_i8x16_u_i16x8(u8x16 x, u8x16 y) {
+  return __builtin_wasm_extmul_high_i8x16_u_i16x8(x, y);
+  // WEBASSEMBLY: call <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(
+  // WEBASSEMBLY-SAME: <16 x i8> %x, <16 x i8> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+i32x4 extmul_low_i16x8_s_i32x4(i16x8 x, i16x8 y) {
+  return __builtin_wasm_extmul_low_i16x8_s_i32x4(x, y);
+  // WEBASSEMBLY: call <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(
+  // WEBASSEMBLY-SAME: <8 x i16> %x, <8 x i16> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+i32x4 extmul_high_i16x8_s_i32x4(i16x8 x, i16x8 y) {
+  return __builtin_wasm_extmul_high_i16x8_s_i32x4(x, y);
+  // WEBASSEMBLY: call <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(
+  // WEBASSEMBLY-SAME: <8 x i16> %x, <8 x i16> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+u32x4 extmul_low_i16x8_u_i32x4(u16x8 x, u16x8 y) {
+  return __builtin_wasm_extmul_low_i16x8_u_i32x4(x, y);
+  // WEBASSEMBLY: call <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(
+  // WEBASSEMBLY-SAME: <8 x i16> %x, <8 x i16> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+u32x4 extmul_high_i16x8_u_i32x4(u16x8 x, u16x8 y) {
+  return __builtin_wasm_extmul_high_i16x8_u_i32x4(x, y);
+  // WEBASSEMBLY: call <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(
+  // WEBASSEMBLY-SAME: <8 x i16> %x, <8 x i16> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+i64x2 extmul_low_i32x4_s_i64x2(i32x4 x, i32x4 y) {
+  return __builtin_wasm_extmul_low_i32x4_s_i64x2(x, y);
+  // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(
+  // WEBASSEMBLY-SAME: <4 x i32> %x, <4 x i32> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+i64x2 extmul_high_i32x4_s_i64x2(i32x4 x, i32x4 y) {
+  return __builtin_wasm_extmul_high_i32x4_s_i64x2(x, y);
+  // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(
+  // WEBASSEMBLY-SAME: <4 x i32> %x, <4 x i32> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+u64x2 extmul_low_i32x4_u_i64x2(u32x4 x, u32x4 y) {
+  return __builtin_wasm_extmul_low_i32x4_u_i64x2(x, y);
+  // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(
+  // WEBASSEMBLY-SAME: <4 x i32> %x, <4 x i32> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
+u64x2 extmul_high_i32x4_u_i64x2(u32x4 x, u32x4 y) {
+  return __builtin_wasm_extmul_high_i32x4_u_i64x2(x, y);
+  // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(
+  // WEBASSEMBLY-SAME: <4 x i32> %x, <4 x i32> %y)
+  // WEBASSEMBLY-NEXT: ret
+}
+
 i32x4 dot_i16x8_s(i16x8 x, i16x8 y) {
   return __builtin_wasm_dot_s_i32x4_i16x8(x, y);
   // WEBASSEMBLY: call <4 x i32> @llvm.wasm.dot(<8 x i16> %x, <8 x i16> %y)
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 7a701ec0e269..5c503a4d6436 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -259,6 +259,23 @@ def int_wasm_store64_lane :
 def int_wasm_popcnt :
   Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem, IntrSpeculatable]>;
 
+def int_wasm_extmul_low_signed :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>],
+            [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_extmul_high_signed :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>],
+            [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_extmul_low_unsigned :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>],
+            [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_extmul_high_unsigned :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>],
+            [IntrNoMem, IntrSpeculatable]>;
+
 //===----------------------------------------------------------------------===//
 // Thread-local storage intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index 529674f66914..3e8cce488490 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -62,12 +62,16 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
   uint64_t Start = OS.tell();
 
   uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
-  if (Binary <= UINT8_MAX) {
+  if (Binary < (1 << 8)) {
     OS << uint8_t(Binary);
-  } else {
-    assert(Binary <= UINT16_MAX && "Several-byte opcodes not supported yet");
+  } else if (Binary < (1 << 16)) {
     OS << uint8_t(Binary >> 8);
     encodeULEB128(uint8_t(Binary), OS);
+  } else if (Binary < (1 << 24)) {
+    OS << uint8_t(Binary >> 16);
+    encodeULEB128(uint16_t(Binary), OS);
+  } else {
+    llvm_unreachable("Very large (prefix + 3 byte) opcodes not supported");
   }
 
   // For br_table instructions, encode the size of the table. In the MCInst,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 5d72def798fd..7e78b6c0eecd 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -16,7 +16,9 @@ multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
                   list<dag> pattern_r, string asmstr_r = "",
                   string asmstr_s = "", bits<32> simdop = -1> {
   defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
-              !or(0xfd00, !and(0xff, simdop))>,
+              !if(!ge(simdop, 0x100),
+                  !or(0xfd0000, !and(0xffff, simdop)),
+                  !or(0xfd00, !and(0xff, simdop)))>,
             Requires<[HasSIMD128]>;
 }
 
@@ -935,6 +937,57 @@ defm DOT : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
                   "i32x4.dot_i16x8_s\t$dst, $lhs, $rhs", "i32x4.dot_i16x8_s",
                   186>;
 
+// Extending multiplication: extmul_{low,high}_P, extmul_high
+multiclass SIMDExtBinary<ValueType vec_t, ValueType arg_t, string vec,
+                         SDNode node, string name, bits<32> simdop> {
+  defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                        (outs), (ins),
+                        [(set (vec_t V128:$dst),
+                          (node (arg_t V128:$lhs), (arg_t V128:$rhs))
+                        )],
+                        vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name,
+                        simdop>;
+}
+
+defm EXTMUL_LOW_S :
+  SIMDExtBinary<v8i16, v16i8, "i16x8", int_wasm_extmul_low_signed,
+                "extmul_low_i8x16_s", 154>;
+defm EXTMUL_HIGH_S :
+  SIMDExtBinary<v8i16, v16i8, "i16x8", int_wasm_extmul_high_signed,
+                "extmul_high_i8x16_s", 157>;
+defm EXTMUL_LOW_U :
+  SIMDExtBinary<v8i16, v16i8, "i16x8", int_wasm_extmul_low_unsigned,
+                "extmul_low_i8x16_u", 158>;
+defm EXTMUL_HIGH_U :
+  SIMDExtBinary<v8i16, v16i8, "i16x8", int_wasm_extmul_high_unsigned,
+                "extmul_high_i8x16_u", 159>;
+
+defm EXTMUL_LOW_S :
+  SIMDExtBinary<v4i32, v8i16, "i32x4", int_wasm_extmul_low_signed,
+                "extmul_low_i16x8_s", 187>;
+defm EXTMUL_HIGH_S :
+  SIMDExtBinary<v4i32, v8i16, "i32x4", int_wasm_extmul_high_signed,
+                "extmul_high_i16x8_s", 189>;
+defm EXTMUL_LOW_U :
+  SIMDExtBinary<v4i32, v8i16, "i32x4", int_wasm_extmul_low_unsigned,
+                "extmul_low_i16x8_u", 190>;
+defm EXTMUL_HIGH_U :
+  SIMDExtBinary<v4i32, v8i16, "i32x4", int_wasm_extmul_high_unsigned,
+                "extmul_high_i16x8_u", 191>;
+
+defm EXTMUL_LOW_S :
+  SIMDExtBinary<v2i64, v4i32, "i64x2", int_wasm_extmul_low_signed,
+                "extmul_low_i32x4_s", 210>;
+defm EXTMUL_HIGH_S :
+  SIMDExtBinary<v2i64, v4i32, "i64x2", int_wasm_extmul_high_signed,
+                "extmul_high_i32x4_s", 211>;
+defm EXTMUL_LOW_U :
+  SIMDExtBinary<v2i64, v4i32, "i64x2", int_wasm_extmul_low_unsigned,
+               "extmul_low_i32x4_u", 214>;
+defm EXTMUL_HIGH_U :
+  SIMDExtBinary<v2i64, v4i32, "i64x2", int_wasm_extmul_high_unsigned,
+                "extmul_high_i32x4_u", 215>;
+
 //===----------------------------------------------------------------------===//
 // Floating-point unary arithmetic
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
index 7c0f43d2c67f..b4edbd3daee2 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -249,6 +249,54 @@ define <8 x i16> @q15mulr_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i16> %a
 }
 
+; CHECK-LABEL: extmul_low_s_v8i16:
+; SIMD128-NEXT: .functype extmul_low_s_v8i16 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8>, <16 x i8>)
+define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %x, <16 x i8> %y) {
+  %a = call <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(
+    <16 x i8> %x, <16 x i8> %y
+  )
+  ret <8 x i16> %a
+}
+
+; CHECK-LABEL: extmul_high_s_v8i16:
+; SIMD128-NEXT: .functype extmul_high_s_v8i16 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8>, <16 x i8>)
+define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %x, <16 x i8> %y) {
+  %a = call <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(
+    <16 x i8> %x, <16 x i8> %y
+  )
+  ret <8 x i16> %a
+}
+
+; CHECK-LABEL: extmul_low_u_v8i16:
+; SIMD128-NEXT: .functype extmul_low_u_v8i16 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8>, <16 x i8>)
+define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %x, <16 x i8> %y) {
+  %a = call <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(
+    <16 x i8> %x, <16 x i8> %y
+  )
+  ret <8 x i16> %a
+}
+
+; CHECK-LABEL: extmul_high_u_v8i16:
+; SIMD128-NEXT: .functype extmul_high_u_v8i16 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8>, <16 x i8>)
+define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %x, <16 x i8> %y) {
+  %a = call <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(
+    <16 x i8> %x, <16 x i8> %y
+  )
+  ret <8 x i16> %a
+}
+
 ; CHECK-LABEL: any_v8i16:
 ; SIMD128-NEXT: .functype any_v8i16 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}}
@@ -328,6 +376,55 @@ define <4 x i32> @dot(<8 x i16> %x, <8 x i16> %y) {
   ret <4 x i32> %a
 }
 
+
+; CHECK-LABEL: extmul_low_s_v4i32:
+; SIMD128-NEXT: .functype extmul_low_s_v4i32 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i32x4.extmul_low_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16>, <8 x i16>)
+define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %x, <8 x i16> %y) {
+  %a = call <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(
+    <8 x i16> %x, <8 x i16> %y
+  )
+  ret <4 x i32> %a
+}
+
+; CHECK-LABEL: extmul_high_s_v4i32:
+; SIMD128-NEXT: .functype extmul_high_s_v4i32 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i32x4.extmul_high_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16>, <8 x i16>)
+define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %x, <8 x i16> %y) {
+  %a = call <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(
+    <8 x i16> %x, <8 x i16> %y
+  )
+  ret <4 x i32> %a
+}
+
+; CHECK-LABEL: extmul_low_u_v4i32:
+; SIMD128-NEXT: .functype extmul_low_u_v4i32 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i32x4.extmul_low_i16x8_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16>, <8 x i16>)
+define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %x, <8 x i16> %y) {
+  %a = call <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(
+    <8 x i16> %x, <8 x i16> %y
+  )
+  ret <4 x i32> %a
+}
+
+; CHECK-LABEL: extmul_high_u_v4i32:
+; SIMD128-NEXT: .functype extmul_high_u_v4i32 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i32x4.extmul_high_i16x8_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16>, <8 x i16>)
+define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %x, <8 x i16> %y) {
+  %a = call <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(
+    <8 x i16> %x, <8 x i16> %y
+  )
+  ret <4 x i32> %a
+}
+
 ; CHECK-LABEL: any_v4i32:
 ; SIMD128-NEXT: .functype any_v4i32 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}}
@@ -395,6 +492,54 @@ define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) {
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================
+; CHECK-LABEL: extmul_low_s_v2i64:
+; SIMD128-NEXT: .functype extmul_low_s_v2i64 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i64x2.extmul_low_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32>, <4 x i32>)
+define <2 x i64> @extmul_low_s_v2i64(<4 x i32> %x, <4 x i32> %y) {
+  %a = call <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(
+    <4 x i32> %x, <4 x i32> %y
+  )
+  ret <2 x i64> %a
+}
+
+; CHECK-LABEL: extmul_high_s_v2i64:
+; SIMD128-NEXT: .functype extmul_high_s_v2i64 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i64x2.extmul_high_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32>, <4 x i32>)
+define <2 x i64> @extmul_high_s_v2i64(<4 x i32> %x, <4 x i32> %y) {
+  %a = call <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(
+    <4 x i32> %x, <4 x i32> %y
+  )
+  ret <2 x i64> %a
+}
+
+; CHECK-LABEL: extmul_low_u_v2i64:
+; SIMD128-NEXT: .functype extmul_low_u_v2i64 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i64x2.extmul_low_i32x4_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32>, <4 x i32>)
+define <2 x i64> @extmul_low_u_v2i64(<4 x i32> %x, <4 x i32> %y) {
+  %a = call <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(
+    <4 x i32> %x, <4 x i32> %y
+  )
+  ret <2 x i64> %a
+}
+
+; CHECK-LABEL: extmul_high_u_v2i64:
+; SIMD128-NEXT: .functype extmul_high_u_v2i64 (v128, v128) -> (v128){{$}}
+; SIMD128-NEXT: i64x2.extmul_high_i32x4_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32>, <4 x i32>)
+define <2 x i64> @extmul_high_u_v2i64(<4 x i32> %x, <4 x i32> %y) {
+  %a = call <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(
+    <4 x i32> %x, <4 x i32> %y
+  )
+  ret <2 x i64> %a
+}
+
 ; CHECK-LABEL: any_v2i64:
 ; SIMD128-NEXT: .functype any_v2i64 (v128) -> (i32){{$}}
 ; SIMD128-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}}
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 2b737b434f61..a4908a0a61af 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -658,4 +658,40 @@ main:
     # CHECK: f64x2.qfms # encoding: [0xfd,0xff,0x01]
     f64x2.qfms
 
+    # CHECK: i16x8.extmul_low_i8x16_s # encoding: [0xfd,0x9a,0x01]
+    i16x8.extmul_low_i8x16_s
+
+    # CHECK: i16x8.extmul_high_i8x16_s # encoding: [0xfd,0x9d,0x01]
+    i16x8.extmul_high_i8x16_s
+
+    # CHECK: i16x8.extmul_low_i8x16_u # encoding: [0xfd,0x9e,0x01]
+    i16x8.extmul_low_i8x16_u
+
+    # CHECK: i16x8.extmul_high_i8x16_u # encoding: [0xfd,0x9f,0x01]
+    i16x8.extmul_high_i8x16_u
+
+    # CHECK: i32x4.extmul_low_i16x8_s # encoding: [0xfd,0xbb,0x01]
+    i32x4.extmul_low_i16x8_s
+
+    # CHECK: i32x4.extmul_high_i16x8_s # encoding: [0xfd,0xbd,0x01]
+    i32x4.extmul_high_i16x8_s
+
+    # CHECK: i32x4.extmul_low_i16x8_u # encoding: [0xfd,0xbe,0x01]
+    i32x4.extmul_low_i16x8_u
+
+    # CHECK: i32x4.extmul_high_i16x8_u # encoding: [0xfd,0xbf,0x01]
+    i32x4.extmul_high_i16x8_u
+
+    # CHECK: i64x2.extmul_low_i32x4_s # encoding: [0xfd,0xd2,0x01]
+    i64x2.extmul_low_i32x4_s
+
+    # CHECK: i64x2.extmul_high_i32x4_s # encoding: [0xfd,0xd3,0x01]
+    i64x2.extmul_high_i32x4_s
+
+    # CHECK: i64x2.extmul_low_i32x4_u # encoding: [0xfd,0xd6,0x01]
+    i64x2.extmul_low_i32x4_u
+
+    # CHECK: i64x2.extmul_high_i32x4_u # encoding: [0xfd,0xd7,0x01]
+    i64x2.extmul_high_i32x4_u
+
     end_function
diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
index 54aa5a8164f2..037e852a452d 100644
--- a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
+++ b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
@@ -39,9 +39,15 @@ void emitWebAssemblyDisassemblerTables(
             ->getValue());
     if (Opc == 0xFFFFFFFF)
       continue; // No opcode defined.
-    assert(Opc <= 0xFFFF);
-    auto Prefix = Opc >> 8;
-    Opc = Opc & 0xFF;
+    assert(Opc <= 0xFFFFFF);
+    unsigned Prefix;
+    if (Opc <= 0xFFFF) {
+      Prefix = Opc >> 8;
+      Opc = Opc & 0xFF;
+    } else {
+      Prefix = Opc >> 16;
+      Opc = Opc & 0xFFFF;
+    }
     auto &CGIP = OpcodeTable[Prefix][Opc];
     // All wasm instructions have a StackBased field of type string, we only
     // want the instructions for which this is "true".