Add x86 encoding for SIMD imul

Only i16x8 and i32x4 are encoded in this commit, mainly because i8x16 and i64x2 do not have simple encodings in x86. i64x2 is not required by the SIMD spec, and there is ongoing discussion (WebAssembly/simd#98 (comment)) about removing i8x16.
abrown · Sep 25, 2019 · dcab6cb · dcab6cb
1 parent 757eac4
commit dcab6cb
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 4 deletions.
diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -2074,6 +2074,19 @@ pub(crate) fn define(
         e.enc_32_64(isub, rec_fa.opcodes(opcodes));
     }
 
+    // SIMD integer multiplication: the x86 ISA does not have instructions for multiplying I8x16
+    // and I64x2 and these are (at the time of writing) not necessary for WASM SIMD.
+    for (ty, opcodes, isap) in [
+        (I16, vec![0x66, 0x0f, 0xd5], None), // PMULLW from SSE2
+        (I32, vec![0x66, 0x0f, 0x38, 0x40], Some(use_sse41_simd)), // PMULLD from SSE4.1
+    ]
+    .iter()
+    .cloned()
+    {
+        let imul = imul.bind_vector_from_lane(ty, sse_vector_size);
+        e.enc_32_64_maybe_isap(imul, rec_fa.opcodes(opcodes), isap);
+    }
+
     // SIMD icmp using PCMPEQ*
     let mut pcmpeq_mapping: HashMap<u64, (Vec<u8>, Option<SettingPredicateNumber>)> =
         HashMap::new();

diff --git a/cranelift-codegen/meta/src/shared/instructions.rs b/cranelift-codegen/meta/src/shared/instructions.rs
@@ -1719,8 +1719,7 @@ pub(crate) fn define(
         Wrapping integer multiplication: `a := x y \pmod{2^B}`.
 
         This instruction does not depend on the signed/unsigned interpretation
-        of the
-        operands.
+        of the operands.
 
         Polymorphic over all integer types (vector and scalar).
         "#,

diff --git a/cranelift-wasm/src/code_translator.rs b/cranelift-wasm/src/code_translator.rs
@@ -998,6 +998,10 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let a = state.pop1();
             state.push1(builder.ins().ineg(a))
         }
+        Operator::I16x8Mul | Operator::I32x4Mul => {
+            let (a, b) = state.pop2();
+            state.push1(builder.ins().imul(a, b))
+        }
         Operator::V128Load { .. }
         | Operator::V128Store { .. }
         | Operator::I8x16Eq
@@ -1066,13 +1070,11 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         | Operator::I16x8AddSaturateU
         | Operator::I16x8SubSaturateS
         | Operator::I16x8SubSaturateU
-        | Operator::I16x8Mul
         | Operator::I32x4AnyTrue
         | Operator::I32x4AllTrue
         | Operator::I32x4Shl
         | Operator::I32x4ShrS
         | Operator::I32x4ShrU
-        | Operator::I32x4Mul
         | Operator::I64x2AnyTrue
         | Operator::I64x2AllTrue
         | Operator::I64x2Shl

diff --git a/filetests/isa/x86/simd-arithmetic.clif b/filetests/isa/x86/simd-arithmetic.clif
@@ -120,3 +120,47 @@ ebb0:
 
     return ; bin: c3
 }
+
+function %imul_i32x4() -> b1 {
+ebb0:
+    ; Multiplies two constant i32x4 vectors lane by lane and checks lanes 0, 1 and 3 of the
+    ; product; the function yields true only if all three checks hold. Lane 2 is not inspected.
+[-, %xmm0]    v0 = vconst.i32x4 [-1 0 1 -2147483647] ; e.g. -2147483647 == 0x80_00_00_01
+[-, %xmm1]    v1 = vconst.i32x4 [2 2 2 2]
+[-, %xmm0]    v2 = imul v0, v1 ; bin: 66 0f 38 40 c1
+
+    ; lane 0: -1 * 2 == -2
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, -2
+
+    ; lane 1: 0 * 2 == 0
+    v5 = extractlane v2, 1
+    v6 = icmp_imm eq v5, 0
+
+    ; lane 3: the product does not fit in 32 bits, so only the low 32 bits are kept
+    v7 = extractlane v2, 3
+    v8 = icmp_imm eq v7, 2 ; 0x80_00_00_01 * 2 == 0x1_00_00_00_02 (and the 1 is dropped)
+
+    ; fold the three lane checks into a single boolean
+    v9 = band v4, v6
+    v10 = band v8, v9
+    return v10
+}
+; run
+
+function %imul_i16x8() -> b1 {
+ebb0:
+    ; Multiplies two constant i16x8 vectors lane by lane and checks lanes 0, 1 and 3 of the
+    ; product; the function yields true only if all three checks hold. Lane 2 and the zeroed
+    ; upper lanes are not inspected.
+[-, %xmm1]    v0 = vconst.i16x8 [-1 0 1 32767 0 0 0 0] ; e.g. 32767 == 0x7f_ff
+[-, %xmm2]    v1 = vconst.i16x8 [2 2 2 2 0 0 0 0]
+[-, %xmm1]    v2 = imul v0, v1 ; bin: 66 0f d5 ca
+
+    ; lane 0: -1 * 2 == -2, compared against its unsigned 16-bit bit pattern
+    v3 = extractlane v2, 0
+    v4 = icmp_imm eq v3, 0xfffe ; TODO -2 will not work here and below because v3 is being
+    ; uextend-ed, not sextend-ed
+
+    ; lane 1: 0 * 2 == 0
+    v5 = extractlane v2, 1
+    v6 = icmp_imm eq v5, 0
+
+    ; lane 3: largest positive i16 doubled
+    v7 = extractlane v2, 3
+    v8 = icmp_imm eq v7, 0xfffe ; 0x7f_ff * 2 == 0xff_fe
+
+    ; fold the three lane checks into a single boolean
+    v9 = band v4, v6
+    v10 = band v8, v9
+
+    ; Yield the combined result (as %imul_i32x4 does) so that lanes 1 and 3 are verified too,
+    ; rather than only the lane 0 check held in v4.
+    return v10
+}
+; run