diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index de78c3b3b7f9..681b3104d552 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -396,6 +396,7 @@ fn define_simd(
     let insertlane = insts.by_name("insertlane");
     let ishl = insts.by_name("ishl");
     let ishl_imm = insts.by_name("ishl_imm");
+    let load_splat = insts.by_name("load_splat");
     let raw_bitcast = insts.by_name("raw_bitcast");
     let scalar_to_vector = insts.by_name("scalar_to_vector");
     let splat = insts.by_name("splat");
@@ -820,6 +821,7 @@ fn define_simd(
     narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
     narrow.custom_legalize(fmin, "expand_minmax_vector");
     narrow.custom_legalize(fmax, "expand_minmax_vector");
+    narrow.custom_legalize(load_splat, "expand_load_splat");
 
     narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
     narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index 2c16734f2766..9cb77493c705 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -4409,5 +4409,24 @@ pub(crate) fn define(
         .other_side_effects(true),
     );
 
+    let Offset = &Operand::new("Offset", &imm.offset32).with_doc("Byte offset from base address");
+    let a = &Operand::new("a", TxN);
+
+    ig.push(
+        Inst::new(
+            "load_splat",
+            r#"
+        Load an element from memory at ``p + Offset`` and return a vector
+        whose lanes are all set to that element.
+
+        This is equivalent to ``load`` followed by ``splat``.
+        "#,
+            &formats.load,
+        )
+        .operands_in(vec![MemFlags, p, Offset])
+        .operands_out(vec![a])
+        .can_load(true),
+    );
+
     ig.build()
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index f85c1028ff2e..95bf4bb63fb8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -680,4 +680,19 @@ impl VectorSize {
             _ => *self,
         }
     }
+
+    /// Return the encoding bits that are used by some SIMD instructions
+    /// for a particular operand size.
+    pub fn enc_size(&self) -> (u32, u32) {
+        let q = self.is_128bits() as u32;
+        let size = match self.lane_size() {
+            ScalarSize::Size8 => 0b00,
+            ScalarSize::Size16 => 0b01,
+            ScalarSize::Size32 => 0b10,
+            ScalarSize::Size64 => 0b11,
+            _ => unreachable!(),
+        };
+
+        (q, size)
+    }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index d422fdc24f36..124fd36c87e7 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
     (op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
 }
 
+fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(size & 0b11, size);
+    0b0_0_0011010_10_00000_110_0_00_00000_00000
+        | q << 30
+        | size << 10
+        | machreg_to_gpr(rn) << 5
+        | machreg_to_vec(rt.to_reg())
+}
+
 fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
 }
@@ -1380,14 +1390,7 @@ impl MachInstEmit for Inst {
                 sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
             }
             &Inst::VecMisc { op, rd, rn, size } => {
-                let enc_size = match size.lane_size() {
-                    ScalarSize::Size8 => 0b00,
-                    ScalarSize::Size16 => 0b01,
-                    ScalarSize::Size32 => 0b10,
-                    ScalarSize::Size64 => 0b11,
-                    _ => unreachable!(),
-                };
-                let q = if size.is_128bits() { 1 } else { 0 };
+                let (q, enc_size) = size.enc_size();
                 let (u, bits_12_16, size) = match op {
                     VecMisc2::Not => (0b1, 0b00101, 0b00),
                     VecMisc2::Neg => (0b1, 0b01011, enc_size),
@@ -1756,13 +1759,7 @@ impl MachInstEmit for Inst {
                 alu_op,
                 size,
             } => {
-                let enc_size = match size.lane_size() {
-                    ScalarSize::Size8 => 0b00,
-                    ScalarSize::Size16 => 0b01,
-                    ScalarSize::Size32 => 0b10,
-                    ScalarSize::Size64 => 0b11,
-                    _ => unreachable!(),
-                };
+                let (q, enc_size) = size.enc_size();
                 let is_float = match alu_op {
                     VecALUOp::Fcmeq
                     | VecALUOp::Fcmgt
@@ -1776,6 +1773,7 @@ impl MachInstEmit for Inst {
                     _ => false,
                 };
                 let enc_float_size = match (is_float, size) {
+                    (true, VectorSize::Size32x2) => 0b0,
                     (true, VectorSize::Size32x4) => 0b0,
                     (true, VectorSize::Size64x2) => 0b1,
                     (true, _) => unimplemented!(),
@@ -1783,58 +1781,73 @@ impl MachInstEmit for Inst {
                     _ => 0,
                 };
 
                 let (top11, bit15_10) = match alu_op {
-                    VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
-                    VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
-                    VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
-                    VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
-                    VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
-                    VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
-                    VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
-                    VecALUOp::Fcmeq => (0b010_01110_00_1, 0b111001),
-                    VecALUOp::Fcmgt => (0b011_01110_10_1, 0b111001),
-                    VecALUOp::Fcmge => (0b011_01110_00_1, 0b111001),
+                    VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
+                    VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011),
+                    VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011),
+                    VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011),
+                    VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011),
+                    VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001),
+                    VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001),
+                    VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001),
                     // The following logical instructions operate on bytes, so are not encoded differently
                     // for the different vector types.
-                    VecALUOp::And => (0b010_01110_00_1, 0b000111),
-                    VecALUOp::Bic => (0b010_01110_01_1, 0b000111),
-                    VecALUOp::Orr => (0b010_01110_10_1, 0b000111),
-                    VecALUOp::Eor => (0b011_01110_00_1, 0b000111),
-                    VecALUOp::Bsl => (0b011_01110_01_1, 0b000111),
-                    VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
-                    VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
-                    VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::And => (0b000_01110_00_1, 0b000111),
+                    VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
+                    VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
+                    VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
+                    VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
+                    VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001),
+                    VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001),
                     VecALUOp::Mul => {
                         debug_assert_ne!(size, VectorSize::Size64x2);
-                        (0b010_01110_00_1 | enc_size << 1, 0b100111)
+                        (0b000_01110_00_1 | enc_size << 1, 0b100111)
                     }
-                    VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001),
-                    VecALUOp::Ushl => (0b011_01110_00_1 | enc_size << 1, 0b010001),
-                    VecALUOp::Umin => (0b011_01110_00_1 | enc_size << 1, 0b011011),
-                    VecALUOp::Smin => (0b010_01110_00_1 | enc_size << 1, 0b011011),
-                    VecALUOp::Umax => (0b011_01110_00_1 | enc_size << 1, 0b011001),
-                    VecALUOp::Smax => (0b010_01110_00_1 | enc_size << 1, 0b011001),
-                    VecALUOp::Urhadd => (0b011_01110_00_1 | enc_size << 1, 0b000101),
-                    VecALUOp::Fadd => (0b010_01110_00_1, 0b110101),
-                    VecALUOp::Fsub => (0b010_01110_10_1, 0b110101),
-                    VecALUOp::Fdiv => (0b011_01110_00_1, 0b111111),
-                    VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
-                    VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
-                    VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
-                    VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111),
+                    VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001),
+                    VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001),
+                    VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011),
+                    VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011),
+                    VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001),
+                    VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001),
+                    VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101),
+                    VecALUOp::Fadd => (0b000_01110_00_1, 0b110101),
+                    VecALUOp::Fsub => (0b000_01110_10_1, 0b110101),
+                    VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111),
+                    VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
+                    VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
+                    VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
+                    VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
                     VecALUOp::Umlal => {
                         debug_assert!(!size.is_128bits());
                         (0b001_01110_00_1 | enc_size << 1, 0b100000)
                     }
                 };
                 let top11 = if is_float {
-                    top11 | enc_float_size << 1
+                    top11 | (q << 9) | enc_float_size << 1
                 } else {
-                    top11
+                    top11 | (q << 9)
                 };
                 sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
             }
+            &Inst::VecLoadReplicate {
+                rd,
+                rn,
+                size,
+                srcloc,
+            } => {
+                let (q, size) = size.enc_size();
+
+                if let Some(srcloc) = srcloc {
+                    // Register the offset at which the actual load instruction starts.
+                    sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                }
+
+                sink.put4(enc_ldst_vec(q, size, rn, rd));
+            }
             &Inst::MovToNZCV { rn } => {
                 sink.put4(0xd51b4200 | machreg_to_gpr(rn));
             }
@@ -2119,9 +2132,12 @@ impl MachInstEmit for Inst {
                     inst.emit(sink, emit_info, state);
                 }
 
-                let (reg, offset) = match mem {
-                    AMode::Unscaled(r, simm9) => (r, simm9.value()),
-                    AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
+                let (reg, index_reg, offset) = match mem {
+                    AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
+                    AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
+                    AMode::UnsignedOffset(r, uimm12scaled) => {
+                        (r, None, uimm12scaled.value() as i32)
+                    }
                     _ => panic!("Unsupported case for LoadAddr: {:?}", mem),
                 };
                 let abs_offset = if offset < 0 {
@@ -2135,9 +2151,22 @@ impl MachInstEmit for Inst {
                     ALUOp::Add64
                 };
 
-                if offset == 0 {
-                    let mov = Inst::mov(rd, reg);
-                    mov.emit(sink, emit_info, state);
+                if let Some((idx, extendop)) = index_reg {
+                    let add = Inst::AluRRRExtend {
+                        alu_op: ALUOp::Add64,
+                        rd,
+                        rn: reg,
+                        rm: idx,
+                        extendop,
+                    };
+
+                    add.emit(sink, emit_info, state);
+                } else if offset == 0 {
+                    if reg != rd.to_reg() {
+                        let mov = Inst::mov(rd, reg);
+
+                        mov.emit(sink, emit_info, state);
+                    }
                 } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
                     let add = Inst::AluRRImm12 {
                         alu_op,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 6d981c2eaa93..48707610ffb6 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2533,10 +2533,10 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(28),
             rn: vreg(12),
             rm: vreg(4),
-            size: VectorSize::Size32x4,
+            size: VectorSize::Size32x2,
         },
-        "9CE5244E",
-        "fcmeq v28.4s, v12.4s, v4.4s",
+        "9CE5240E",
+        "fcmeq v28.2s, v12.2s, v4.2s",
     ));
 
     insns.push((
@@ -2965,10 +2965,10 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(6),
             rn: vreg(9),
             rm: vreg(8),
-            size: VectorSize::Size8x16,
+            size: VectorSize::Size8x8,
         },
-        "2665286E",
-        "umax v6.16b, v9.16b, v8.16b",
+        "2665282E",
+        "umax v6.8b, v9.8b, v8.8b",
     ));
 
     insns.push((
@@ -3507,6 +3507,28 @@ fn test_aarch64_binemit() {
         "tbx v3.16b, {{ v11.16b, v12.16b }}, v19.16b",
     ));
 
+    insns.push((
+        Inst::VecLoadReplicate {
+            rd: writable_vreg(31),
+            rn: xreg(0),
+            srcloc: None,
+            size: VectorSize::Size64x2,
+        },
+        "1FCC404D",
+        "ld1r {{ v31.2d }}, [x0]",
+    ));
+
+    insns.push((
+        Inst::VecLoadReplicate {
+            rd: writable_vreg(0),
+            rn: xreg(25),
+            srcloc: None,
+            size: VectorSize::Size8x8,
+        },
+        "20C3400D",
+        "ld1r {{ v0.8b }}, [x25]",
+    ));
+
     insns.push((
         Inst::Extend {
             rd: writable_xreg(1),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 544d04c23cec..e9c0f15425a2 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -975,6 +975,14 @@ pub enum Inst {
         is_extension: bool,
     },
 
+    /// Load an element and replicate to all lanes of a vector.
+    VecLoadReplicate {
+        rd: Writable<Reg>,
+        rn: Reg,
+        size: VectorSize,
+        srcloc: Option<SourceLoc>,
+    },
+
     /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
     MovToNZCV {
         rn: Reg,
     },
@@ -1609,7 +1617,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
                 collector.add_def(rd);
             }
         }
-
+        &Inst::VecLoadReplicate { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
             collector.add_use(rn);
             collector.add_use(rm);
        }
@@ -1762,8 +1773,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
         &Inst::LoadExtName { rd, .. } => {
             collector.add_def(rd);
         }
-        &Inst::LoadAddr { rd, mem: _ } => {
+        &Inst::LoadAddr { rd, ref mem } => {
             collector.add_def(rd);
+            memarg_regs(mem, collector);
         }
         &Inst::VirtualSPOffsetAdj { .. } => {}
         &Inst::EmitIsland { .. } => {}
@@ -2189,6 +2201,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
                 map_def(mapper, rd);
             }
         }
+        &mut Inst::VecLoadReplicate {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuCmp32 {
             ref mut rn,
             ref mut rm,
        } => {
@@ -3412,6 +3432,12 @@ impl Inst {
                 let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
                 format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
             }
+            &Inst::VecLoadReplicate { rd, rn, size, .. } => {
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = rn.show_rru(mb_rru);
+
+                format!("ld1r {{ {} }}, [{}]", rd, rn)
+            }
             &Inst::MovToNZCV { rn } => {
                 let rn = rn.show_rru(mb_rru);
                 format!("msr nzcv, {}", rn)
            }
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index fc28cb35818e..ecdcb9c6d1d2 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1197,6 +1197,29 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
 
+        Opcode::LoadSplat => {
+            let off = ctx.data(insn).load_store_offset().unwrap();
+            let ty = ty.unwrap();
+            let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
+            let memflags = ctx.memflags(insn).expect("memory flags");
+            let rd = get_output_reg(ctx, outputs[0]);
+            let size = VectorSize::from_ty(ty);
+            let srcloc = if memflags.notrap() {
+                None
+            } else {
+                Some(ctx.srcloc(insn))
+            };
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+
+            ctx.emit(Inst::LoadAddr { rd: tmp, mem });
+            ctx.emit(Inst::VecLoadReplicate {
+                rd,
+                rn: tmp.to_reg(),
+                size,
+                srcloc,
+            });
+        }
+
         Opcode::Store
         | Opcode::Istore8
         | Opcode::Istore16
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 688e620d8302..397b21f69d3c 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1728,6 +1728,7 @@ pub(crate) fn emit(
             op,
             src: src_e,
             dst: reg_g,
+            srcloc,
         } => {
             let rex = RexFlags::clear_w();
             let (prefix, opcode, length) = match op {
@@ -1819,6 +1820,10 @@ pub(crate) fn emit(
                     emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex);
                 }
                 RegMem::Mem { addr } => {
+                    if let Some(srcloc) = *srcloc {
+                        // Register the offset at which the actual load instruction starts.
+                        sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                    }
                     let addr = &addr.finalize(state);
                     emit_std_reg_mem(sink, prefix, opcode, length, reg_g.to_reg(), addr, rex);
                 }
@@ -1889,7 +1894,7 @@ pub(crate) fn emit(
             // and negative zero. These instructions merge the sign bits in that
             // case, and are no-ops otherwise.
             let op = if *is_min { or_op } else { and_op };
-            let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst);
+            let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst, None);
             inst.emit(sink, info, state);
 
             let inst = Inst::jmp_known(done);
@@ -1899,13 +1904,13 @@ pub(crate) fn emit(
             // read-only operand: perform an addition between the two operands, which has the
             // desired NaN propagation effects.
             sink.bind_label(propagate_nan);
-            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst);
+            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst, None);
             inst.emit(sink, info, state);
 
             one_way_jmp(sink, CC::P, done);
 
             sink.bind_label(do_min_max);
-            let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst);
+            let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst, None);
             inst.emit(sink, info, state);
 
             sink.bind_label(done);
@@ -1916,7 +1921,8 @@ pub(crate) fn emit(
             src,
             dst,
             imm,
-            is64: w,
+            is64,
+            srcloc,
         } => {
             let (prefix, opcode, len) = match op {
                 SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2),
@@ -1933,7 +1939,7 @@ pub(crate) fn emit(
                 SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
-            let rex = if *w {
+            let rex = if *is64 {
                 RexFlags::set_w()
             } else {
                 RexFlags::clear_w()
             };
@@ -1955,6 +1961,10 @@ pub(crate) fn emit(
                     }
                 }
                 RegMem::Mem { addr } => {
+                    if let Some(srcloc) = *srcloc {
+                        // Register the offset at which the actual load instruction starts.
+                        sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                    }
                     let addr = &addr.finalize(state);
                     assert!(
                         !regs_swapped,
@@ -1963,7 +1973,7 @@ pub(crate) fn emit(
                     emit_std_reg_mem(sink, prefix, opcode, len, dst.to_reg(), addr, rex);
                 }
             }
-            sink.put1(*imm)
+            sink.put1(*imm);
         }
 
         Inst::XmmLoadConstSeq { val, dst, ty } => {
@@ -2188,7 +2198,7 @@ pub(crate) fn emit(
             } else {
                 SseOpcode::Addss
             };
-            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst);
+            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst, None);
             inst.emit(sink, info, state);
 
             sink.bind_label(done);
@@ -2295,8 +2305,12 @@ pub(crate) fn emit(
             // If the input was positive, saturate to INT_MAX.
 
             // Zero out tmp_xmm.
-            let inst =
-                Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm);
+            let inst = Inst::xmm_rm_r(
+                SseOpcode::Xorpd,
+                RegMem::reg(tmp_xmm.to_reg()),
+                *tmp_xmm,
+                None,
+            );
             inst.emit(sink, info, state);
 
             let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
@@ -2367,8 +2381,12 @@ pub(crate) fn emit(
             sink.bind_label(check_positive);
 
             // Zero out the tmp_xmm register.
-            let inst =
-                Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm);
+            let inst = Inst::xmm_rm_r(
+                SseOpcode::Xorpd,
+                RegMem::reg(tmp_xmm.to_reg()),
+                *tmp_xmm,
+                None,
+            );
             inst.emit(sink, info, state);
 
             let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
@@ -2522,7 +2540,7 @@ pub(crate) fn emit(
 
             sink.bind_label(handle_large);
 
-            let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src);
+            let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src, None);
             inst.emit(sink, info, state);
 
             let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size);
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 62992be2bd49..71120a101df1 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -2983,12 +2983,12 @@ fn test_x64_emit() {
 
     // XMM_RM_R: float binary ops
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0),
+        Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0, None),
         "F30F58C1",
         "addss %xmm1, %xmm0",
     ));
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13),
+        Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13, None),
         "F3450F58EB",
         "addss %xmm11, %xmm13",
     ));
@@ -2997,23 +2997,24 @@ fn test_x64_emit() {
     insns.push((
         Inst::xmm_rm_r(
             SseOpcode::Addss,
             RegMem::mem(Amode::imm_reg_reg_shift(123, r10, rdx, 2)),
             w_xmm0,
+            None,
         ),
         "F3410F5844927B",
         "addss 123(%r10,%rdx,4), %xmm0",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4),
+        Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4, None),
         "F2410F58E7",
         "addsd %xmm15, %xmm4",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1),
+        Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1, None),
         "F30F5CC8",
         "subss %xmm0, %xmm1",
     ));
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1),
+        Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1, None),
         "F3410F5CCC",
         "subss %xmm12, %xmm1",
     ));
@@ -3022,57 +3023,58 @@ fn test_x64_emit() {
     insns.push((
         Inst::xmm_rm_r(
             SseOpcode::Subss,
             RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rax, 3)),
             w_xmm10,
+            None,
         ),
         "F3450F5C94C241010000",
         "subss 321(%r10,%rax,8), %xmm10",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14),
+        Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14, None),
         "F2440F5CF5",
         "subsd %xmm5, %xmm14",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4),
+        Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4, None),
         "F30F59E5",
         "mulss %xmm5, %xmm4",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4),
+        Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4, None),
         "F20F59E5",
         "mulsd %xmm5, %xmm4",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7),
+        Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7, None),
         "F3410F5EF8",
         "divss %xmm8, %xmm7",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4),
+        Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4, None),
         "F20F5EE5",
         "divsd %xmm5, %xmm4",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12),
+        Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12, None),
         "440F54E3",
         "andps %xmm3, %xmm12",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11),
+        Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11, None),
"440F55DC", "andnps %xmm4, %xmm11", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15), + Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15, None), "440F56F9", "orps %xmm1, %xmm15", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4), + Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4, None), "0F56E5", "orps %xmm5, %xmm4", )); @@ -3081,211 +3083,211 @@ fn test_x64_emit() { // XMM_RM_R: Integer Packed insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5), + Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5, None), "66410FFCE9", "paddb %xmm9, %xmm5", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6, None), "660FFDF7", "paddw %xmm7, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13, None), "66450FFEEC", "paddd %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8, None), "66440FD4C1", "paddq %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5), + Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5, None), "66410FECE9", "paddsb %xmm9, %xmm5", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6, None), "660FEDF7", "paddsw %xmm7, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13, None), "66450FDCEC", "paddusb %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8, None), "66440FDDC1", "paddusw %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5), + Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5, None), "66410FE8E9", "psubsb %xmm9, %xmm5", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6, None), "660FE9F7", "psubsw %xmm7, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13, None), "66450FD8EC", "psubusb %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8, None), "66440FD9C1", "psubusw %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13, None), "66450FE0EC", "pavgb %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8, None), "66440FE3C1", "pavgw %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9, None), "66440FF8CD", "psubb %xmm5, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7), + Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7, None), "660FF9FE", "psubw %xmm6, %xmm7", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubd, 
RegMem::reg(xmm13), w_xmm12), + Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::reg(xmm13), w_xmm12, None), "66450FFAE5", "psubd %xmm13, %xmm12", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1, None), "66410FFBC8", "psubq %xmm8, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6, None), "66410F3840F7", "pmulld %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1, None), "66410FD5CE", "pmullw %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9, None), "66450FF4C8", "pmuludq %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6, None), "66410F383CF7", "pmaxsb %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6, None), "66410FEEF7", "pmaxsw %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6, None), "66410F383DF7", "pmaxsd %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1, None), "66410FDECE", "pmaxub %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1, None), "66410F383ECE", "pmaxuw %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1, None), "66410F383FCE", "pmaxud %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9, None), "66450F3838C8", "pminsb %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9, None), "66450FEAC8", "pminsw %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9, None), "66450F3839C8", "pminsd %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2, None), "660FDAD3", "pminub %xmm3, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2, None), "660F383AD3", "pminuw %xmm3, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminud, RegMem::reg(xmm3), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pminud, RegMem::reg(xmm3), w_xmm2, None), "660F383BD3", "pminud %xmm3, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2, None), "66410FEFD3", "pxor %xmm11, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2, None), "66410F3800D3", "pshufb %xmm11, 
%xmm2", )); @@ -3488,12 +3490,12 @@ fn test_x64_emit() { // ======================================================== // XmmRmRImm insns.push(( - Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false), + Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false, None), "660FC2CD02", "cmppd $2, %xmm5, %xmm1", )); insns.push(( - Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false), + Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false, None), "410FC2FF00", "cmpps $0, %xmm15, %xmm7", )); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 1fe0de694183..aac925db626f 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -212,6 +212,7 @@ pub enum Inst { op: SseOpcode, src: RegMem, dst: Writable, + srcloc: Option, }, /// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt, @@ -338,6 +339,7 @@ pub enum Inst { dst: Writable, imm: u8, is64: bool, + srcloc: Option, }, // ===================================== @@ -711,10 +713,20 @@ impl Inst { } } - pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable) -> Self { + pub(crate) fn xmm_rm_r( + op: SseOpcode, + src: RegMem, + dst: Writable, + srcloc: Option, + ) -> Self { src.assert_regclass_is(RegClass::V128); debug_assert!(dst.to_reg().get_class() == RegClass::V128); - Inst::XmmRmR { op, src, dst } + Inst::XmmRmR { + op, + src, + dst, + srcloc, + } } pub(crate) fn xmm_uninit_value(dst: Writable) -> Self { @@ -869,6 +881,7 @@ impl Inst { dst: Writable, imm: u8, is64: bool, + srcloc: Option, ) -> Inst { Inst::XmmRmRImm { op, @@ -876,6 +889,7 @@ impl Inst { dst, imm, is64, + srcloc, } } @@ -1233,16 +1247,26 @@ impl Inst { /// Choose which instruction to use for comparing two values for equality. pub(crate) fn equals(ty: Type, from: RegMem, to: Writable) -> Inst { match ty { - types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to), - types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to), - types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to), - types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to), - types::F32X4 => { - Inst::xmm_rm_r_imm(SseOpcode::Cmpps, from, to, FcmpImm::Equal.encode(), false) - } - types::F64X2 => { - Inst::xmm_rm_r_imm(SseOpcode::Cmppd, from, to, FcmpImm::Equal.encode(), false) - } + types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to, None), + types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to, None), + types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to, None), + types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to, None), + types::F32X4 => Inst::xmm_rm_r_imm( + SseOpcode::Cmpps, + from, + to, + FcmpImm::Equal.encode(), + false, + None, + ), + types::F64X2 => Inst::xmm_rm_r_imm( + SseOpcode::Cmppd, + from, + to, + FcmpImm::Equal.encode(), + false, + None, + ), _ => unimplemented!("unimplemented type for Inst::equals: {}", ty), } } @@ -1250,9 +1274,11 @@ impl Inst { /// Choose which instruction to use for computing a bitwise AND on two values. 
     pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
         match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pand, from, to),
+            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to, None),
+            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to, None),
+            _ if ty.is_vector() && ty.bits() == 128 => {
+                Inst::xmm_rm_r(SseOpcode::Pand, from, to, None)
+            }
             _ => unimplemented!("unimplemented type for Inst::and: {}", ty),
         }
     }
@@ -1260,9 +1286,11 @@ impl Inst {
     /// Choose which instruction to use for computing a bitwise AND NOT on two values.
     pub(crate) fn and_not(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
         match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pandn, from, to),
+            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to, None),
+            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to, None),
+            _ if ty.is_vector() && ty.bits() == 128 => {
+                Inst::xmm_rm_r(SseOpcode::Pandn, from, to, None)
+            }
             _ => unimplemented!("unimplemented type for Inst::and_not: {}", ty),
         }
     }
@@ -1270,9 +1298,11 @@ impl Inst {
     /// Choose which instruction to use for computing a bitwise OR on two values.
     pub(crate) fn or(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
         match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Por, from, to),
+            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to, None),
+            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to, None),
+            _ if ty.is_vector() && ty.bits() == 128 => {
+                Inst::xmm_rm_r(SseOpcode::Por, from, to, None)
+            }
             _ => unimplemented!("unimplemented type for Inst::or: {}", ty),
         }
     }
@@ -1280,9 +1310,11 @@ impl Inst {
     /// Choose which instruction to use for computing a bitwise XOR on two values.
     pub(crate) fn xor(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
         match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pxor, from, to),
+            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to, None),
+            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to, None),
+            _ if ty.is_vector() && ty.bits() == 128 => {
+                Inst::xmm_rm_r(SseOpcode::Pxor, from, to, None)
+            }
             _ => unimplemented!("unimplemented type for Inst::xor: {}", ty),
         }
     }
@@ -1429,7 +1461,7 @@ impl PrettyPrint for Inst {
                 dst.show_rru(mb_rru),
             ),
 
-            Inst::XmmRmR { op, src, dst } => format!(
+            Inst::XmmRmR { op, src, dst, .. } => format!(
                 "{} {}, {}",
                 ljustify(op.to_string()),
                 src.show_rru_sized(mb_rru, 8),
@@ -1459,7 +1491,7 @@ impl PrettyPrint for Inst {
                 show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8),
             ),
 
-            Inst::XmmRmRImm { op, src, dst, imm, is64 } => format!(
+            Inst::XmmRmRImm { op, src, dst, imm, is64, .. } => format!(
                 "{} ${}, {}, {}",
                 ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })),
                 imm,
@@ -2595,6 +2627,7 @@ impl MachInst for Inst {
                     SseOpcode::Xorps,
                     RegMem::reg(to_reg.to_reg()),
                     to_reg,
+                    None,
                 ));
             } else {
                 let tmp = alloc_tmp(RegClass::I64, types::I32);
@@ -2613,6 +2646,7 @@ impl MachInst for Inst {
                     SseOpcode::Xorpd,
                     RegMem::reg(to_reg.to_reg()),
                     to_reg,
+                    None,
                 ));
             } else {
                 let tmp = alloc_tmp(RegClass::I64, types::I64);
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 108072b97cce..614589d1603d 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -3,7 +3,7 @@
 use crate::data_value::DataValue;
 use crate::ir::{
     condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
-    Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type,
+    Inst as IRInst, InstructionData, LibCall, Opcode, Signature, SourceLoc, Type,
 };
 use crate::isa::x64::abi::*;
 use crate::isa::x64::inst::args::*;
@@ -227,6 +227,7 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
     dst: Writable<Reg>,
     lane: u8,
     ty: Type,
+    srcloc: Option<SourceLoc>,
 ) {
     if !ty.is_float() {
         let (sse_op, is64) = match ty.lane_bits() {
@@ -236,13 +237,13 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
             64 => (SseOpcode::Pinsrd, true),
             _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
         };
-        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64));
+        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64, srcloc));
     } else if ty == types::F32 {
         let sse_op = SseOpcode::Insertps;
         // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
        // shifted into bits 5:6).
         let lane = 0b00_00_00_00 | lane << 4;
-        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false));
+        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false, srcloc));
     } else if ty == types::F64 {
         let sse_op = match lane {
             // Move the lowest quadword in replacement to vector without changing
@@ -256,7 +257,7 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
         // Here we use the `xmm_rm_r` encoding because it correctly tells the register
         // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
         // encoding formats like `xmm_unary_rm_r` treat it as a `def`.
-        ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
+        ctx.emit(Inst::xmm_rm_r(sse_op, src, dst, srcloc));
     } else {
         panic!("unable to emit insertlane for type: {}", ty)
     }
@@ -694,6 +695,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     SseOpcode::Pmuludq,
                     RegMem::reg(lhs.clone()),
                     rhs_1,
+                    None,
                 ));
 
                 // B' = B
@@ -707,7 +709,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     RegMemImm::imm(32),
                     lhs_1,
                 ));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pmuludq,
+                    RegMem::reg(rhs),
+                    lhs_1,
+                    None,
+                ));
 
                 // B' = B' + A'
                 // B' = B' << 32
@@ -715,6 +722,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     SseOpcode::Paddq,
                     RegMem::reg(rhs_1.to_reg()),
                     lhs_1,
+                    None,
                 ));
                 ctx.emit(Inst::xmm_rmi_reg(
                     SseOpcode::Psllq,
@@ -731,11 +739,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     SseOpcode::Pmuludq,
                     RegMem::reg(lhs.clone()),
                     rhs_1,
+                    None,
                 ));
                 ctx.emit(Inst::xmm_rm_r(
                     SseOpcode::Paddq,
                     RegMem::reg(lhs_1.to_reg()),
                     rhs_1,
+                    None,
                 ));
                 ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
                 return Ok(());
@@ -770,7 +780,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
                 // Move the `lhs` to the same register as `dst`.
                 ctx.emit(Inst::gen_move(dst, lhs, ty));
-                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
             } else {
                 let is_64 = ty == types::I64;
                 let alu_op = match op {
@@ -828,7 +838,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 // Note the flipping of operands: the `rhs` operand is used as the destination instead
                 // of the `lhs` as in the other bit operations above (e.g. `band`).
                 ctx.emit(Inst::gen_move(dst, rhs, ty));
-                ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst));
+                ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst, None));
             }
 
         Opcode::Iabs => {
@@ -884,7 +894,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
                 // Move the `lhs` to the same register as `dst`.
                 ctx.emit(Inst::gen_move(dst, lhs, ty));
-                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
             } else {
                 panic!("Unsupported type for {} instruction: {}", op, ty);
             }
@@ -1007,8 +1017,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     SseOpcode::Pxor,
                     RegMem::reg(tmp.to_reg()),
                     tmp,
+                    None,
                 ));
-                ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp));
+                ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp, None));
                 ctx.emit(Inst::xmm_unary_rm_r(
                     SseOpcode::Movapd,
                     RegMem::reg(tmp.to_reg()),
@@ -1561,34 +1572,44 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 };
 
                 match condcode {
-                    IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)),
+                    IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None)),
                     IntCC::NotEqual => {
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None));
                         // Emit all 1s into the `tmp` register.
                         let tmp = ctx.alloc_tmp(RegClass::V128, ty);
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp, None));
                         // Invert the result of the `PCMPEQ*`.
-                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
+                        ctx.emit(Inst::xmm_rm_r(
+                            SseOpcode::Pxor,
+                            RegMem::from(tmp),
+                            dst,
+                            None,
+                        ));
                     }
                     IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
-                        ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
+                        ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst, None))
                     }
                     IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => {
-                        ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
+                        ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst, None));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None))
                     }
                     IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
-                        ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
+                        ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst, None));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None));
                         // Emit all 1s into the `tmp` register.
                         let tmp = ctx.alloc_tmp(RegClass::V128, ty);
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp, None));
                         // Invert the result of the `PCMPEQ*`.
-                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
+                        ctx.emit(Inst::xmm_rm_r(
+                            SseOpcode::Pxor,
+                            RegMem::from(tmp),
+                            dst,
+                            None,
+                        ));
                     }
                     IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
-                        ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst));
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
+                        ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst, None));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None))
                    }
                     _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
                 }
@@ -1686,7 +1707,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ctx.emit(Inst::gen_move(dst, lhs, input_ty));
 
                     // Emit the comparison.
-                    ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false));
+                    ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false, None));
                 }
             }
@@ -1899,7 +1920,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         ty
                     ),
                 };
-                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
             }
 
         Opcode::Fmin | Opcode::Fmax => {
@@ -1988,15 +2009,15 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
 
                     // Perform min in reverse direction
-                    ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1, None));
 
                     // Perform min in original direction
-                    ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
+                    ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst, None));
 
                     // X64 handles propagation of -0's and Nans differently between left and right
                     // operands. After doing the min in both directions, this OR will
                     // guarantee capture of -0's and Nan in our tmp register
-                    ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1, None));
 
                     // Compare unordered to create mask for lanes containing NaNs and then use
                     // that mask to saturate the NaN containing lanes in the tmp register with 1s.
@@ -2009,8 +2030,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         dst,
                         cond.encode(),
                         false,
+                        None,
                     ));
-                    ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(
+                        or_op,
+                        RegMem::reg(dst.to_reg()),
+                        tmp_xmm1,
+                        None,
+                    ));
 
                     // The dst register holds a mask for lanes containing NaNs.
                     // We take that mask and shift in preparation for creating a different mask
@@ -2022,7 +2049,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
                     // Finally we do a nand with the tmp register to produce the final results
                     // in the dst.
-                    ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+                    ctx.emit(Inst::xmm_rm_r(
+                        andn_op,
+                        RegMem::reg(tmp_xmm1.to_reg()),
+                        dst,
+                        None,
+                    ));
                 } else {
                     let (
                         mov_op,
@@ -2065,23 +2097,43 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
 
                     // Perform max in reverse direction.
-                    ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(
+                        max_op,
+                        RegMem::reg(dst.to_reg()),
+                        tmp_xmm1,
+                        None,
+                    ));
 
                     // Perform max in original direction.
-                    ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
+                    ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst, None));
 
                     // Get the difference between the two results and store in tmp.
                     // Max uses a different approach than min to account for potential
                     // discrepancies with plus/minus 0.
-                    ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+                    ctx.emit(Inst::xmm_rm_r(
+                        xor_op,
+                        RegMem::reg(tmp_xmm1.to_reg()),
+                        dst,
+                        None,
+                    ));
 
                     // X64 handles propagation of -0's and Nans differently between left and right
                     // operands. After doing the max in both directions, this OR will
                     // guarantee capture of 0's and Nan in our tmp register.
-                    ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(
+                        or_op,
+                        RegMem::reg(dst.to_reg()),
+                        tmp_xmm1,
+                        None,
+                    ));
 
                     // Capture NaNs and sign discrepancies.
-                    ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(
+                        sub_op,
+                        RegMem::reg(dst.to_reg()),
+                        tmp_xmm1,
+                        None,
+                    ));
 
                     // Compare unordered to create mask for lanes containing NaNs and then use
                     // that mask to saturate the NaN containing lanes in the tmp register with 1s.
@@ -2092,6 +2144,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         dst,
                         cond.encode(),
                         false,
+                        None,
                     ));
 
                     // The dst register holds a mask for lanes containing NaNs.
@@ -2104,7 +2157,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
                     // Finally we do a nand with the tmp register to produce the final results
                     // in the dst.
-                    ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+                    ctx.emit(Inst::xmm_rm_r(
+                        andn_op,
+                        RegMem::reg(tmp_xmm1.to_reg()),
+                        dst,
+                        None,
+                    ));
                 }
             }
         }
@@ -2327,7 +2385,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ctx.emit(inst);
                 }
 
-                ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
+                ctx.emit(Inst::xmm_rm_r(opcode, src, dst, None));
             } else {
                 // Eventually vector constants should be available in `gen_constant` and this block
                 // can be merged with the one above (TODO).
@@ -2348,6 +2406,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         tmp,
                         cond.encode(),
                         false,
+                        None,
                     );
                     ctx.emit(cmpps);
 
@@ -2367,7 +2426,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ctx.emit(shift);
 
                     // Apply shifted mask (XOR or AND).
-                    let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
+                    let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst, None);
                     ctx.emit(mask);
                 } else {
                     panic!("unexpected type {:?} for Fabs", output_ty);
                }
@@ -2426,14 +2485,20 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     dst,
                     None,
                 ));
-                ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst));
+                ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst, None));
                 ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2, None));
                 ctx.emit(Inst::xmm_rm_r(
                     and_op,
                     RegMem::reg(tmp_xmm1.to_reg()),
                     tmp_xmm2,
+                    None,
+                ));
+                ctx.emit(Inst::xmm_rm_r(
+                    or_op,
+                    RegMem::reg(tmp_xmm2.to_reg()),
+                    dst,
+                    None,
                 ));
-                ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst));
             }
 
         Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
@@ -3154,7 +3219,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 // After loading the constructed mask in a temporary register, we use this to
                 // shuffle the `dst` register (remember that, in this case, it is the same as
                 // `src` so we disregard this register).
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pshufb,
+                    RegMem::from(tmp),
+                    dst,
+                    None,
+                ));
             } else {
                 // If `lhs` and `rhs` are different, we must shuffle each separately and then OR
                 // them together. This is necessary due to PSHUFB semantics. As in the case above,
@@ -3166,7 +3236,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
                 let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
                 ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp1, ty));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pshufb,
+                    RegMem::from(tmp1),
+                    tmp0,
+                    None,
+                ));
 
                 // PSHUFB the second argument, placing zeroes for unused lanes.
                 let constructed_mask = mask
@@ -3176,11 +3251,21 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     .collect();
                 let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
                 ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp2, ty));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pshufb,
+                    RegMem::from(tmp2),
+                    dst,
+                    None,
+                ));
 
                 // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
                 // is not important).
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Orps,
+                    RegMem::from(tmp0),
+                    dst,
+                    None,
+                ));
 
                 // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
             }
@@ -3214,6 +3299,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     SseOpcode::Paddusb,
                     RegMem::from(zero_mask),
                     swizzle_mask,
+                    None,
                 ));
 
                 // Shuffle `dst` using the fixed-up `swizzle_mask`.
                 ctx.emit(Inst::xmm_rm_r(
                     SseOpcode::Pshufb,
                     RegMem::from(swizzle_mask),
                     dst,
+                    None,
                 ));
             }
 
@@ -3240,7 +3327,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 debug_assert!(lane < ty.lane_count() as u8);
 
                 ctx.emit(Inst::gen_move(dst, in_vec, ty));
-                emit_insert_lane(ctx, src, dst, lane, ty.lane_type());
+                emit_insert_lane(ctx, src, dst, lane, ty.lane_type(), None);
             }
 
         Opcode::Extractlane => {
@@ -3266,7 +3353,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
                     };
                     let src = RegMem::reg(src);
-                    ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit));
+                    ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit, None));
                 } else {
                     if lane == 0 {
                         // Remove the extractlane instruction, leaving the float where it is. The upper
@@ -3288,35 +3375,57 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                             _ => unreachable!(),
                         };
                         let src = RegMem::reg(src);
-                        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
+                        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false, None));
                     }
                }
            }
 
-        Opcode::Splat => {
+        Opcode::Splat | Opcode::LoadSplat => {
             let ty = ty.unwrap();
             assert_eq!(ty.bits(), 128);
             let src_ty = ctx.input_ty(insn, 0);
             assert!(src_ty.bits() < 128);
-            let src = input_to_reg_mem(ctx, inputs[0]);
+
+            let (src, srcloc) = match op {
+                Opcode::Splat => (input_to_reg_mem(ctx, inputs[0]), None),
+                Opcode::LoadSplat => {
+                    let offset = ctx.data(insn).load_store_offset().unwrap();
+                    let amode = lower_to_amode(ctx, inputs[0], offset);
+                    (RegMem::mem(amode), Some(ctx.srcloc(insn)))
+                }
+                _ => unreachable!(),
+            };
             let dst = get_output_reg(ctx, outputs[0]);
 
             // We know that splat will overwrite all of the lanes of `dst` but it takes several
             // instructions to do so. Because of the multiple instructions, there is no good way to
             // declare `dst` a `def` except with the following pseudo-instruction.
             ctx.emit(Inst::xmm_uninit_value(dst));
+
+            // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
+            // and VPBROADCAST*.
             match ty.lane_bits() {
                 8 => {
-                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
+                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc);
                     // Initialize a register with all 0s.
                     let tmp = ctx.alloc_tmp(RegClass::V128, ty);
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Pxor,
+                        RegMem::from(tmp),
+                        tmp,
+                        srcloc,
+                    ));
                     // Shuffle the lowest byte lane to all other lanes.
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Pshufb,
+                        RegMem::from(tmp),
+                        dst,
+                        srcloc,
+                    ))
                 }
                 16 => {
-                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
-                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
+                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type(), srcloc);
+                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type(), srcloc);
                     // Shuffle the lowest two lanes to all other lanes.
                     ctx.emit(Inst::xmm_rm_r_imm(
                         SseOpcode::Pshufd,
@@ -3324,10 +3433,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         dst,
                         0,
                         false,
+                        srcloc,
                     ))
                 }
                 32 => {
-                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
+                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc);
                     // Shuffle the lowest lane to all other lanes.
                     ctx.emit(Inst::xmm_rm_r_imm(
                         SseOpcode::Pshufd,
@@ -3335,11 +3445,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         dst,
                         0,
                         false,
+                        srcloc,
                     ))
                 }
                 64 => {
-                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
-                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
+                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type(), srcloc);
+                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type(), srcloc);
                 }
                 _ => panic!("Invalid type to splat: {}", ty),
             }
@@ -3373,9 +3484,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
             // Initialize a register with all 0s.
             let tmp = ctx.alloc_tmp(RegClass::V128, ty);
-            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
+            ctx.emit(Inst::xmm_rm_r(
+                SseOpcode::Pxor,
+                RegMem::from(tmp),
+                tmp,
+                None,
+            ));
             // Compare to see what lanes are filled with all 1s.
-            ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
+            ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp, None));
             // Set the ZF if the result is all zeroes.
             ctx.emit(Inst::xmm_cmp_rm_r(
                 SseOpcode::Ptest,
diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
index 72890cffd9f0..976f1581e3cf 100644
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -1892,3 +1892,31 @@ fn expand_tls_value(
         unreachable!();
     }
 }
+
+fn expand_load_splat(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+
+    pos.use_srcloc(inst);
+
+    let (ptr, offset, flags) = match pos.func.dfg[inst] {
+        ir::InstructionData::Load {
+            opcode: ir::Opcode::LoadSplat,
+            arg,
+            offset,
+            flags,
+        } => (arg, offset, flags),
+        _ => panic!(
+            "Expected load_splat: {}",
+            pos.func.dfg.display_inst(inst, None)
+        ),
+    };
+    let ty = pos.func.dfg.ctrl_typevar(inst);
+    let load = pos.ins().load(ty.lane_type(), flags, ptr, offset);
+
+    pos.func.dfg.replace(inst).splat(ty, load);
+}
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 7c827802ba4a..ef1804cf12b5 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1380,19 +1380,17 @@ pub fn translate_operator(
         | Operator::V128Load16Splat { memarg }
         | Operator::V128Load32Splat { memarg }
         | Operator::V128Load64Splat { memarg } => {
-            // TODO: For spec compliance, this is initially implemented as a combination of `load +
-            // splat` but could be implemented eventually as a single instruction (`load_splat`).
-            // See https://github.com/bytecodealliance/wasmtime/issues/1175.
-            translate_load(
+            let opcode = ir::Opcode::LoadSplat;
+            let result_ty = type_of(op);
+            let (flags, base, offset) = prepare_load(
                 memarg,
-                ir::Opcode::Load,
-                type_of(op).lane_type(),
+                mem_op_size(opcode, result_ty.lane_type()),
                 builder,
                 state,
                 environ,
             )?;
-            let splatted = builder.ins().splat(type_of(op), state.pop1());
-            state.push1(splatted)
+            let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
+            state.push1(dfg.first_result(load))
         }
         Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
             let vector = pop1_with_bitcast(state, type_of(op), builder);
@@ -2040,7 +2038,7 @@ fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 {
         ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
         ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
         ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
-        ir::Opcode::Store | ir::Opcode::Load => ty.bytes(),
+        ir::Opcode::Store | ir::Opcode::Load | ir::Opcode::LoadSplat => ty.bytes(),
         _ => panic!("unknown size of mem op for {:?}", opcode),
     }
 }
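
Note: as a rough illustration of the semantics this patch adds (the value numbers and
types below are illustrative, not taken from the patch's test suite), a CLIF instruction
such as

    v1 = load_splat.i32x4 v0+8

behaves like the pair that `expand_load_splat` produces when legalizing on the old x86
backend:

    v2 = load.i32 v0+8
    v1 = splat.i32x4 v2

The new backends instead select a single machine sequence: AArch64 lowers it to `ld1r`
via the new `VecLoadReplicate` instruction, while x64 reuses the existing `splat`
lowering with a memory operand, registering a heap-out-of-bounds trap location whenever
the memory flags permit trapping.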