Add x86 encoding for SIMD imul

Only i16x8 and i32x4 are encoded in this commit, mainly because i8x16 and i64x2 do not have simple encodings in x86. i64x2 is not required by the SIMD spec, and there is discussion (WebAssembly/simd#98 (comment)) about removing i8x16.
abrown · Sep 18, 2019 · 2c9dad7 · 2c9dad7
1 parent c33cb52
commit 2c9dad7
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 4 deletions.
diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -2012,6 +2012,20 @@ pub(crate) fn define(
         e.enc_32_64(isub, rec_fa.opcodes(opcodes));
     }
 
+    // SIMD integer multiplication: the x86 ISA does not have instructions for multiplying I8x16
+    // and I64x2. The necessary encodings are avoided for now (TODO) until we finalize whether they
+    // are essential to the Wasm SIMD spec (I64x2 has already been removed).
+    for (ty, opcodes, isap) in [
+        (I16, vec![0x66, 0x0f, 0xd5], None), // PMULLW from SSE2
+        (I32, vec![0x66, 0x0f, 0x38, 0x40], Some(use_sse41_simd)), // PMULLD from SSE4.1
+    ]
+    .iter()
+    .cloned()
+    {
+        let imul = imul.bind_vector_from_lane(ty, sse_vector_size);
+        e.enc_32_64_maybe_isap(imul, rec_fa.opcodes(opcodes), isap);
+    }
+
     // Reference type instructions
 
     // Null references implemented as iconst 0.

diff --git a/cranelift-codegen/meta/src/shared/instructions.rs b/cranelift-codegen/meta/src/shared/instructions.rs
@@ -1682,8 +1682,7 @@ pub(crate) fn define(
         Wrapping integer multiplication: `a := x y \pmod{2^B}`.
 
         This instruction does not depend on the signed/unsigned interpretation
-        of the
-        operands.
+        of the operands.
 
         Polymorphic over all integer types (vector and scalar).
         "#,

diff --git a/cranelift-wasm/src/code_translator.rs b/cranelift-wasm/src/code_translator.rs
@@ -986,6 +986,10 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let a = state.pop1();
             state.push1(builder.ins().ineg(a))
         }
+        Operator::I16x8Mul | Operator::I32x4Mul => {
+            let (a, b) = state.pop2();
+            state.push1(builder.ins().imul(a, b))
+        }
         Operator::V128Load { .. }
         | Operator::V128Store { .. }
         | Operator::V8x16Shuffle { .. }
@@ -1055,13 +1059,11 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         | Operator::I16x8AddSaturateU
         | Operator::I16x8SubSaturateS
         | Operator::I16x8SubSaturateU
-        | Operator::I16x8Mul
         | Operator::I32x4AnyTrue
         | Operator::I32x4AllTrue
         | Operator::I32x4Shl
         | Operator::I32x4ShrS
         | Operator::I32x4ShrU
-        | Operator::I32x4Mul
         | Operator::I64x2AnyTrue
         | Operator::I64x2AllTrue
         | Operator::I64x2Shl

diff --git a/filetests/isa/x86/simd-arithmetic.clif b/filetests/isa/x86/simd-arithmetic.clif
@@ -120,3 +120,47 @@ ebb0:
 
     return ; bin: c3
 }
+
+function %imul_i32x4() -> b1 { ; multiplies two i32x4 constants lane-wise and verifies lanes 0, 1 and 3
+ebb0:
+[-, %xmm0]    v0 = vconst.i32x4 [-1 0 1 -2147483647] ; i.e. -2147483647 == 0x80_00_00_01
+[-, %xmm1]    v1 = vconst.i32x4 [2 2 2 2]
+[-, %xmm0]    v2 = imul v0, v1 ; bin: 66 0f 38 40 c1
+
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, -2 ; lane 0 holds -1 * 2 == -2
+
+    v5 = extractlane v2, 1
+    v6 = icmp_imm eq v5, 0 ; lane 1 holds 0 * 2 == 0
+
+    v7 = extractlane v2, 3 ; lane 2 (1 * 2) is skipped; only lanes 0, 1 and 3 are compared
+    v8 = icmp_imm eq v7, 2 ; 0x80_00_00_01 * 2 == 0x1_00_00_00_02 (and the 1 is dropped)
+
+    v9 = band v4, v6
+    v10 = band v8, v9 ; true only when all three lane comparisons succeeded
+    return v10
+}
+; run
+
+function %imul_i16x8() -> b1 { ; multiplies two i16x8 constants lane-wise and verifies lanes 0, 1 and 3
+ebb0:
+[-, %xmm1]    v0 = vconst.i16x8 [-1 0 1 32767 0 0 0 0] ; i.e. 32767 == 0x7f_ff
+[-, %xmm2]    v1 = vconst.i16x8 [2 2 2 2 0 0 0 0]
+[-, %xmm1]    v2 = imul v0, v1 ; bin: 66 0f d5 ca
+
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, 0xfffe ; TODO -2 will not work here and below because v3 is being
+    ; uextend-ed, not sextend-ed
+
+    v5 = extractlane v2, 1
+    v6 = icmp_imm eq v5, 0 ; lane 1 holds 0 * 2 == 0
+
+    v7 = extractlane v2, 3
+    v8 = icmp_imm eq v7, 0xfffe ; 0x7f_ff * 2 == 0xff_fe
+
+    v9 = band v4, v6
+    v10 = band v8, v9 ; true only when all three lane comparisons succeeded
+
+    return v10 ; return the combined result of the lane 0, 1 and 3 comparisons
+}
+; run