From dcab6cbae957398a2c4b00d1df6daabbe012e9ea Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Wed, 18 Sep 2019 14:44:06 -0700
Subject: [PATCH] Add x86 encoding for SIMD imul

Only i16x8 and i32x4 are encoded in this commit mainly because i8x16 and i64x2 do not have simple encodings in x86. i64x2 is not required by the SIMD spec and there is discussion (https://github.com/WebAssembly/simd/pull/98#issuecomment-530092217) about removing i8x16.
---
 .../meta/src/isa/x86/encodings.rs             | 13 ++++++
 .../meta/src/shared/instructions.rs           |  3 +-
 cranelift-wasm/src/code_translator.rs         |  6 ++-
 filetests/isa/x86/simd-arithmetic.clif        | 44 +++++++++++++++++++
 4 files changed, 62 insertions(+), 4 deletions(-)
diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
index 8b5ce47e9..a4fa729ed 100644
--- a/cranelift-codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -2074,6 +2074,19 @@ pub(crate) fn define(
         e.enc_32_64(isub, rec_fa.opcodes(opcodes));
     }
 
+    // SIMD integer multiplication: x86 has no single instruction for multiplying I8x16 or I64x2
+    // vectors, and neither is (at the time of writing) necessary for WASM SIMD.
+    for (ty, opcodes, isap) in [
+        (I16, vec![0x66, 0x0f, 0xd5], None), // PMULLW from SSE2
+        (I32, vec![0x66, 0x0f, 0x38, 0x40], Some(use_sse41_simd)), // PMULLD from SSE4.1
+    ]
+    .iter()
+    .cloned()
+    {
+        let imul = imul.bind_vector_from_lane(ty, sse_vector_size);
+        e.enc_32_64_maybe_isap(imul, rec_fa.opcodes(opcodes), isap);
+    }
+
     // SIMD icmp using PCMPEQ*
     let mut pcmpeq_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
         HashMap::new();
diff --git a/cranelift-codegen/meta/src/shared/instructions.rs b/cranelift-codegen/meta/src/shared/instructions.rs
index 8dbb00f3e..8ea17036f 100644
--- a/cranelift-codegen/meta/src/shared/instructions.rs
+++ b/cranelift-codegen/meta/src/shared/instructions.rs
@@ -1719,8 +1719,7 @@ pub(crate) fn define(
         Wrapping integer multiplication: `a := x y \pmod{2^B}`.
 
         This instruction does not depend on the signed/unsigned interpretation
-        of the
-        operands.
+        of the operands.
 
         Polymorphic over all integer types (vector and scalar).
         "#,
diff --git a/cranelift-wasm/src/code_translator.rs b/cranelift-wasm/src/code_translator.rs
index ef5861920..420cd5af2 100644
--- a/cranelift-wasm/src/code_translator.rs
+++ b/cranelift-wasm/src/code_translator.rs
@@ -998,6 +998,10 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let a = state.pop1();
             state.push1(builder.ins().ineg(a))
         }
+        Operator::I16x8Mul | Operator::I32x4Mul => {
+            let (a, b) = state.pop2();
+            state.push1(builder.ins().imul(a, b))
+        }
         Operator::V128Load { .. }
         | Operator::V128Store { .. }
         | Operator::I8x16Eq
@@ -1066,13 +1070,11 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         | Operator::I16x8AddSaturateU
         | Operator::I16x8SubSaturateS
         | Operator::I16x8SubSaturateU
-        | Operator::I16x8Mul
         | Operator::I32x4AnyTrue
         | Operator::I32x4AllTrue
         | Operator::I32x4Shl
         | Operator::I32x4ShrS
         | Operator::I32x4ShrU
-        | Operator::I32x4Mul
         | Operator::I64x2AnyTrue
         | Operator::I64x2AllTrue
         | Operator::I64x2Shl
diff --git a/filetests/isa/x86/simd-arithmetic.clif b/filetests/isa/x86/simd-arithmetic.clif
index 824417772..e2714a91d 100644
--- a/filetests/isa/x86/simd-arithmetic.clif
+++ b/filetests/isa/x86/simd-arithmetic.clif
@@ -120,3 +120,47 @@ ebb0:
 
     return ; bin: c3
 }
+
+function %imul_i32x4() -> b1 { ; binemit + run test for the i32x4 imul encoding; returns true on success
+ebb0:
+[-, %xmm0]    v0 = vconst.i32x4 [-1 0 1 -2147483647] ; i.e. -2147483647 == 0x80_00_00_01
+[-, %xmm1]    v1 = vconst.i32x4 [2 2 2 2]
+[-, %xmm0]    v2 = imul v0, v1 ; bin: 66 0f 38 40 c1
+
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, -2 ; lane 0 holds -1 * 2 == -2
+
+    v5 = extractlane v2, 1
+    v6 = icmp_imm eq v5, 0 ; lane 1 holds 0 * 2 == 0
+
+    v7 = extractlane v2, 3
+    v8 = icmp_imm eq v7, 2 ; 0x80_00_00_01 * 2 == 0x1_00_00_00_02 (and the 1 is dropped)
+
+    v9 = band v4, v6
+    v10 = band v8, v9 ; true only when lanes 0, 1 and 3 all match (lane 2 is not inspected)
+    return v10
+}
+; run
+
+function %imul_i16x8() -> b1 { ; binemit + run test for the i16x8 imul encoding; returns true on success
+ebb0:
+[-, %xmm1]    v0 = vconst.i16x8 [-1 0 1 32767 0 0 0 0] ; i.e. 32767 == 0x7f_ff
+[-, %xmm2]    v1 = vconst.i16x8 [2 2 2 2 0 0 0 0]
+[-, %xmm1]    v2 = imul v0, v1 ; bin: 66 0f d5 ca
+
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, 0xfffe ; TODO -2 will not work here and below because v3 is being
+    ; uextend-ed, not sextend-ed
+
+    v5 = extractlane v2, 1
+    v6 = icmp_imm eq v5, 0 ; lane 1 holds 0 * 2 == 0
+
+    v7 = extractlane v2, 3
+    v8 = icmp_imm eq v7, 0xfffe ; 0x7f_ff * 2 == 0xff_fe
+
+    v9 = band v4, v6
+    v10 = band v8, v9 ; true only when lanes 0, 1 and 3 all match
+
+    return v10 ; return the combined result; v4 alone would only verify lane 0
+}
+; run