diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index de78c3b3b7f9..681b3104d552 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -396,6 +396,7 @@ fn define_simd(
     let insertlane = insts.by_name("insertlane");
     let ishl = insts.by_name("ishl");
     let ishl_imm = insts.by_name("ishl_imm");
+    let load_splat = insts.by_name("load_splat");
     let raw_bitcast = insts.by_name("raw_bitcast");
     let scalar_to_vector = insts.by_name("scalar_to_vector");
     let splat = insts.by_name("splat");
@@ -820,6 +821,7 @@ fn define_simd(
     narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
     narrow.custom_legalize(fmin, "expand_minmax_vector");
     narrow.custom_legalize(fmax, "expand_minmax_vector");
+    narrow.custom_legalize(load_splat, "expand_load_splat");
 
     narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
     narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index 2c16734f2766..9cb77493c705 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -4409,5 +4409,24 @@ pub(crate) fn define(
         .other_side_effects(true),
     );
 
+    let Offset = &Operand::new("Offset", &imm.offset32).with_doc("Byte offset from base address");
+    let a = &Operand::new("a", TxN);
+
+    ig.push(
+        Inst::new(
+            "load_splat",
+            r#"
+        Load an element from memory at ``p + Offset`` and return a vector
+        whose lanes are all set to that element.
+
+        This is equivalent to ``load`` followed by ``splat``.
+        "#,
+            &formats.load,
+        )
+        .operands_in(vec![MemFlags, p, Offset])
+        .operands_out(vec![a])
+        .can_load(true),
+    );
+
     ig.build()
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index f85c1028ff2e..95bf4bb63fb8 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -680,4 +680,19 @@ impl VectorSize {
             _ => *self,
         }
     }
+
+    /// Return the encoding bits that are used by some SIMD instructions
+    /// for a particular operand size.
+    pub fn enc_size(&self) -> (u32, u32) {
+        let q = self.is_128bits() as u32;
+        let size = match self.lane_size() {
+            ScalarSize::Size8 => 0b00,
+            ScalarSize::Size16 => 0b01,
+            ScalarSize::Size32 => 0b10,
+            ScalarSize::Size64 => 0b11,
+            _ => unreachable!(),
+        };
+
+        (q, size)
+    }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index d422fdc24f36..124fd36c87e7 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
     (op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
 }
 
+fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
+    debug_assert_eq!(q & 0b1, q);
+    debug_assert_eq!(size & 0b11, size);
+    0b0_0_0011010_10_00000_110_0_00_00000_00000
+        | q << 30
+        | size << 10
+        | machreg_to_gpr(rn) << 5
+        | machreg_to_vec(rt.to_reg())
+}
+
 fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
     (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
 }
@@ -1380,14 +1390,7 @@ impl MachInstEmit for Inst {
                 sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
             }
             &Inst::VecMisc { op, rd, rn, size } => {
-                let enc_size = match size.lane_size() {
-                    ScalarSize::Size8 => 0b00,
-                    ScalarSize::Size16 => 0b01,
-                    ScalarSize::Size32 => 0b10,
-                    ScalarSize::Size64 => 0b11,
-                    _ => unreachable!(),
-                };
-                let q = if size.is_128bits() { 1 } else { 0 };
+                let (q, enc_size) = size.enc_size();
                 let (u, bits_12_16, size) = match op {
                     VecMisc2::Not => (0b1, 0b00101, 0b00),
                     VecMisc2::Neg => (0b1, 0b01011, enc_size),
@@ -1756,13 +1759,7 @@ impl MachInstEmit for Inst {
                 alu_op,
                 size,
             } => {
-                let enc_size = match size.lane_size() {
-                    ScalarSize::Size8 => 0b00,
-                    ScalarSize::Size16 => 0b01,
-                    ScalarSize::Size32 => 0b10,
-                    ScalarSize::Size64 => 0b11,
-                    _ => unreachable!(),
-                };
+                let (q, enc_size) = size.enc_size();
                 let is_float = match alu_op {
                     VecALUOp::Fcmeq
                     | VecALUOp::Fcmgt
@@ -1776,6 +1773,7 @@ impl MachInstEmit for Inst {
                     _ => false,
                 };
                 let enc_float_size = match (is_float, size) {
+                    (true, VectorSize::Size32x2) => 0b0,
                     (true, VectorSize::Size32x4) => 0b0,
                     (true, VectorSize::Size64x2) => 0b1,
                     (true, _) => unimplemented!(),
@@ -1783,58 +1781,73 @@ impl MachInstEmit for Inst {
                     _ => 0,
                 };
 
                 let (top11, bit15_10) = match alu_op {
-                    VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
-                    VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
-                    VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
-                    VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
-                    VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
-                    VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
-                    VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
-                    VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
-                    VecALUOp::Fcmeq => (0b010_01110_00_1, 0b111001),
-                    VecALUOp::Fcmgt => (0b011_01110_10_1, 0b111001),
-                    VecALUOp::Fcmge => (0b011_01110_00_1, 0b111001),
+                    VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
+                    VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011),
+                    VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011),
+                    VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011),
+                    VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011),
+                    VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101),
+                    VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111),
+                    VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001),
+                    VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001),
+                    VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001),
                     // The following logical instructions operate on bytes, so are not encoded differently
                     // for the different vector types.
-                    VecALUOp::And => (0b010_01110_00_1, 0b000111),
-                    VecALUOp::Bic => (0b010_01110_01_1, 0b000111),
-                    VecALUOp::Orr => (0b010_01110_10_1, 0b000111),
-                    VecALUOp::Eor => (0b011_01110_00_1, 0b000111),
-                    VecALUOp::Bsl => (0b011_01110_01_1, 0b000111),
-                    VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
-                    VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
-                    VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::And => (0b000_01110_00_1, 0b000111),
+                    VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
+                    VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
+                    VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
+                    VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
+                    VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001),
+                    VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001),
+                    VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001),
                     VecALUOp::Mul => {
                         debug_assert_ne!(size, VectorSize::Size64x2);
-                        (0b010_01110_00_1 | enc_size << 1, 0b100111)
+                        (0b000_01110_00_1 | enc_size << 1, 0b100111)
                     }
-                    VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001),
-                    VecALUOp::Ushl => (0b011_01110_00_1 | enc_size << 1, 0b010001),
-                    VecALUOp::Umin => (0b011_01110_00_1 | enc_size << 1, 0b011011),
-                    VecALUOp::Smin => (0b010_01110_00_1 | enc_size << 1, 0b011011),
-                    VecALUOp::Umax => (0b011_01110_00_1 | enc_size << 1, 0b011001),
-                    VecALUOp::Smax => (0b010_01110_00_1 | enc_size << 1, 0b011001),
-                    VecALUOp::Urhadd => (0b011_01110_00_1 | enc_size << 1, 0b000101),
-                    VecALUOp::Fadd => (0b010_01110_00_1, 0b110101),
-                    VecALUOp::Fsub => (0b010_01110_10_1, 0b110101),
-                    VecALUOp::Fdiv => (0b011_01110_00_1, 0b111111),
-                    VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
-                    VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
-                    VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
-                    VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111),
+                    VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001),
+                    VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001),
+                    VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011),
+                    VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011),
+                    VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001),
+                    VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001),
+                    VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101),
+                    VecALUOp::Fadd => (0b000_01110_00_1, 0b110101),
+                    VecALUOp::Fsub => (0b000_01110_10_1, 0b110101),
+                    VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111),
+                    VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
+                    VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
+                    VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
+                    VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
                     VecALUOp::Umlal => {
                         debug_assert!(!size.is_128bits());
                         (0b001_01110_00_1 | enc_size << 1, 0b100000)
                     }
                 };
                 let top11 = if is_float {
-                    top11 | enc_float_size << 1
+                    top11 | (q << 9) | enc_float_size << 1
                 } else {
-                    top11
+                    top11 | (q << 9)
                 };
                 sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
             }
+            &Inst::VecLoadReplicate {
+                rd,
+                rn,
+                size,
+                srcloc,
+            } => {
+                let (q, size) = size.enc_size();
+
+                if let Some(srcloc) = srcloc {
+                    // Register the offset at which the actual load instruction starts.
+                    sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                }
+
+                sink.put4(enc_ldst_vec(q, size, rn, rd));
+            }
             &Inst::MovToNZCV { rn } => {
                 sink.put4(0xd51b4200 | machreg_to_gpr(rn));
             }
@@ -2119,9 +2132,12 @@ impl MachInstEmit for Inst {
                     inst.emit(sink, emit_info, state);
                 }
 
-                let (reg, offset) = match mem {
-                    AMode::Unscaled(r, simm9) => (r, simm9.value()),
-                    AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
+                let (reg, index_reg, offset) = match mem {
+                    AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
+                    AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
+                    AMode::UnsignedOffset(r, uimm12scaled) => {
+                        (r, None, uimm12scaled.value() as i32)
+                    }
                     _ => panic!("Unsupported case for LoadAddr: {:?}", mem),
                 };
                 let abs_offset = if offset < 0 {
@@ -2135,9 +2151,22 @@ impl MachInstEmit for Inst {
                     ALUOp::Add64
                 };
 
-                if offset == 0 {
-                    let mov = Inst::mov(rd, reg);
-                    mov.emit(sink, emit_info, state);
+                if let Some((idx, extendop)) = index_reg {
+                    let add = Inst::AluRRRExtend {
+                        alu_op: ALUOp::Add64,
+                        rd,
+                        rn: reg,
+                        rm: idx,
+                        extendop,
+                    };
+
+                    add.emit(sink, emit_info, state);
+                } else if offset == 0 {
+                    if reg != rd.to_reg() {
+                        let mov = Inst::mov(rd, reg);
+
+                        mov.emit(sink, emit_info, state);
+                    }
                 } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
                     let add = Inst::AluRRImm12 {
                         alu_op,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 6d981c2eaa93..48707610ffb6 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2533,10 +2533,10 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(28),
             rn: vreg(12),
             rm: vreg(4),
-            size: VectorSize::Size32x4,
+            size: VectorSize::Size32x2,
         },
-        "9CE5244E",
-        "fcmeq v28.4s, v12.4s, v4.4s",
+        "9CE5240E",
+        "fcmeq v28.2s, v12.2s, v4.2s",
     ));
 
     insns.push((
@@ -2965,10 +2965,10 @@ fn test_aarch64_binemit() {
             rd: writable_vreg(6),
             rn: vreg(9),
             rm: vreg(8),
-            size: VectorSize::Size8x16,
+            size: VectorSize::Size8x8,
         },
-        "2665286E",
-        "umax v6.16b, v9.16b, v8.16b",
+        "2665282E",
+        "umax v6.8b, v9.8b, v8.8b",
     ));
 
     insns.push((
@@ -3507,6 +3507,28 @@ fn test_aarch64_binemit() {
         "tbx v3.16b, {{ v11.16b, v12.16b }}, v19.16b",
     ));
 
+    insns.push((
+        Inst::VecLoadReplicate {
+            rd: writable_vreg(31),
+            rn: xreg(0),
+            srcloc: None,
+            size: VectorSize::Size64x2,
+        },
+        "1FCC404D",
+        "ld1r {{ v31.2d }}, [x0]",
+    ));
+
+    insns.push((
+        Inst::VecLoadReplicate {
+            rd: writable_vreg(0),
+            rn: xreg(25),
+            srcloc: None,
+            size: VectorSize::Size8x8,
+        },
+        "20C3400D",
+        "ld1r {{ v0.8b }}, [x25]",
+    ));
+
     insns.push((
         Inst::Extend {
             rd: writable_xreg(1),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 544d04c23cec..e9c0f15425a2 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -975,6 +975,14 @@ pub enum Inst {
         is_extension: bool,
     },
 
+    /// Load an element and replicate to all lanes of a vector.
+    VecLoadReplicate {
+        rd: Writable<Reg>,
+        rn: Reg,
+        size: VectorSize,
+        srcloc: Option<SourceLoc>,
+    },
+
     /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
     MovToNZCV {
         rn: Reg,
     },
@@ -1609,7 +1617,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
                 collector.add_def(rd);
             }
         }
-
+        &Inst::VecLoadReplicate { rd, rn, .. } => {
+            collector.add_def(rd);
+            collector.add_use(rn);
+        }
         &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
             collector.add_use(rn);
             collector.add_use(rm);
        }
@@ -1762,8 +1773,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
         &Inst::LoadExtName { rd, .. } => {
             collector.add_def(rd);
         }
-        &Inst::LoadAddr { rd, mem: _ } => {
+        &Inst::LoadAddr { rd, ref mem } => {
             collector.add_def(rd);
+            memarg_regs(mem, collector);
         }
         &Inst::VirtualSPOffsetAdj { .. } => {}
         &Inst::EmitIsland { .. } => {}
@@ -2189,6 +2201,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
                 map_def(mapper, rd);
             }
         }
+        &mut Inst::VecLoadReplicate {
+            ref mut rd,
+            ref mut rn,
+            ..
+        } => {
+            map_def(mapper, rd);
+            map_use(mapper, rn);
+        }
         &mut Inst::FpuCmp32 {
             ref mut rn,
             ref mut rm,
        } => {
@@ -3412,6 +3432,12 @@ impl Inst {
                 let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
                 format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
             }
+            &Inst::VecLoadReplicate { rd, rn, size, .. } => {
+                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+                let rn = rn.show_rru(mb_rru);
+
+                format!("ld1r {{ {} }}, [{}]", rd, rn)
+            }
             &Inst::MovToNZCV { rn } => {
                 let rn = rn.show_rru(mb_rru);
                 format!("msr nzcv, {}", rn)
            }
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index fc28cb35818e..ecdcb9c6d1d2 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1197,6 +1197,29 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
 
+        Opcode::LoadSplat => {
+            let off = ctx.data(insn).load_store_offset().unwrap();
+            let ty = ty.unwrap();
+            let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off);
+            let memflags = ctx.memflags(insn).expect("memory flags");
+            let rd = get_output_reg(ctx, outputs[0]);
+            let size = VectorSize::from_ty(ty);
+            let srcloc = if memflags.notrap() {
+                None
+            } else {
+                Some(ctx.srcloc(insn))
+            };
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+
+            ctx.emit(Inst::LoadAddr { rd: tmp, mem });
+            ctx.emit(Inst::VecLoadReplicate {
+                rd,
+                rn: tmp.to_reg(),
+                size,
+                srcloc,
+            });
+        }
+
         Opcode::Store
         | Opcode::Istore8
         | Opcode::Istore16
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 688e620d8302..397b21f69d3c 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1728,6 +1728,7 @@ pub(crate) fn emit(
             op,
             src: src_e,
             dst: reg_g,
+            srcloc,
         } => {
             let rex = RexFlags::clear_w();
             let (prefix, opcode, length) = match op {
@@ -1819,6 +1820,10 @@ pub(crate) fn emit(
                     emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex);
                 }
                 RegMem::Mem { addr } => {
+                    if let Some(srcloc) = *srcloc {
+                        // Register the offset at which the actual load instruction starts.
+                        sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                    }
                     let addr = &addr.finalize(state);
                     emit_std_reg_mem(sink, prefix, opcode, length, reg_g.to_reg(), addr, rex);
                 }
@@ -1889,7 +1894,7 @@ pub(crate) fn emit(
             // and negative zero. These instructions merge the sign bits in that
             // case, and are no-ops otherwise.
             let op = if *is_min { or_op } else { and_op };
-            let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst);
+            let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst, None);
             inst.emit(sink, info, state);
 
             let inst = Inst::jmp_known(done);
@@ -1899,13 +1904,13 @@ pub(crate) fn emit(
             // read-only operand: perform an addition between the two operands, which has the
             // desired NaN propagation effects.
             sink.bind_label(propagate_nan);
-            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst);
+            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst, None);
             inst.emit(sink, info, state);
 
             one_way_jmp(sink, CC::P, done);
 
             sink.bind_label(do_min_max);
-            let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst);
+            let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst, None);
             inst.emit(sink, info, state);
 
             sink.bind_label(done);
@@ -1916,7 +1921,8 @@ pub(crate) fn emit(
             src,
             dst,
             imm,
-            is64: w,
+            is64,
+            srcloc,
         } => {
             let (prefix, opcode, len) = match op {
                 SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2),
@@ -1933,7 +1939,7 @@ pub(crate) fn emit(
                 SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
             };
-            let rex = if *w {
+            let rex = if *is64 {
                 RexFlags::set_w()
             } else {
                 RexFlags::clear_w()
             };
@@ -1955,6 +1961,10 @@ pub(crate) fn emit(
                     }
                 }
                 RegMem::Mem { addr } => {
+                    if let Some(srcloc) = *srcloc {
+                        // Register the offset at which the actual load instruction starts.
+                        sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+                    }
                     let addr = &addr.finalize(state);
                     assert!(
                         !regs_swapped,
@@ -1963,7 +1973,7 @@ pub(crate) fn emit(
                     emit_std_reg_mem(sink, prefix, opcode, len, dst.to_reg(), addr, rex);
                 }
             }
-            sink.put1(*imm)
+            sink.put1(*imm);
         }
 
         Inst::XmmLoadConstSeq { val, dst, ty } => {
@@ -2188,7 +2198,7 @@ pub(crate) fn emit(
             } else {
                 SseOpcode::Addss
             };
-            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst);
+            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst, None);
             inst.emit(sink, info, state);
 
             sink.bind_label(done);
@@ -2295,8 +2305,12 @@ pub(crate) fn emit(
             // If the input was positive, saturate to INT_MAX.
 
             // Zero out tmp_xmm.
-            let inst =
-                Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm);
+            let inst = Inst::xmm_rm_r(
+                SseOpcode::Xorpd,
+                RegMem::reg(tmp_xmm.to_reg()),
+                *tmp_xmm,
+                None,
+            );
             inst.emit(sink, info, state);
 
             let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
@@ -2367,8 +2381,12 @@ pub(crate) fn emit(
             sink.bind_label(check_positive);
 
             // Zero out the tmp_xmm register.
-            let inst =
-                Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm);
+            let inst = Inst::xmm_rm_r(
+                SseOpcode::Xorpd,
+                RegMem::reg(tmp_xmm.to_reg()),
+                *tmp_xmm,
+                None,
+            );
             inst.emit(sink, info, state);
 
             let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg());
@@ -2522,7 +2540,7 @@ pub(crate) fn emit(
 
             sink.bind_label(handle_large);
 
-            let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src);
+            let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src, None);
             inst.emit(sink, info, state);
 
             let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size);
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index 62992be2bd49..71120a101df1 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -2983,12 +2983,12 @@ fn test_x64_emit() {
 
     // XMM_RM_R: float binary ops
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0),
+        Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0, None),
         "F30F58C1",
         "addss %xmm1, %xmm0",
     ));
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13),
+        Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13, None),
         "F3450F58EB",
         "addss %xmm11, %xmm13",
     ));
@@ -2997,23 +2997,24 @@ fn test_x64_emit() {
     insns.push((
         Inst::xmm_rm_r(
             SseOpcode::Addss,
             RegMem::mem(Amode::imm_reg_reg_shift(123, r10, rdx, 2)),
             w_xmm0,
+            None,
         ),
         "F3410F5844927B",
         "addss 123(%r10,%rdx,4), %xmm0",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4),
+        Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4, None),
         "F2410F58E7",
         "addsd %xmm15, %xmm4",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1),
+        Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1, None),
         "F30F5CC8",
         "subss %xmm0, %xmm1",
     ));
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1),
+        Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1, None),
         "F3410F5CCC",
         "subss %xmm12, %xmm1",
     ));
@@ -3022,57 +3023,58 @@ fn test_x64_emit() {
     insns.push((
         Inst::xmm_rm_r(
             SseOpcode::Subss,
             RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rax, 3)),
             w_xmm10,
+            None,
         ),
         "F3450F5C94C241010000",
         "subss 321(%r10,%rax,8), %xmm10",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14),
+        Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14, None),
         "F2440F5CF5",
         "subsd %xmm5, %xmm14",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4),
+        Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4, None),
         "F30F59E5",
         "mulss %xmm5, %xmm4",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4),
+        Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4, None),
         "F20F59E5",
         "mulsd %xmm5, %xmm4",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7),
+        Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7, None),
         "F3410F5EF8",
         "divss %xmm8, %xmm7",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4),
+        Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4, None),
         "F20F5EE5",
         "divsd %xmm5, %xmm4",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12),
+        Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12, None),
         "440F54E3",
         "andps %xmm3, %xmm12",
     ));
 
     insns.push((
-        Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11),
+        Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11, None),
"440F55DC", "andnps %xmm4, %xmm11", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15), + Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15, None), "440F56F9", "orps %xmm1, %xmm15", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4), + Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4, None), "0F56E5", "orps %xmm5, %xmm4", )); @@ -3081,211 +3083,211 @@ fn test_x64_emit() { // XMM_RM_R: Integer Packed insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5), + Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5, None), "66410FFCE9", "paddb %xmm9, %xmm5", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6, None), "660FFDF7", "paddw %xmm7, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13, None), "66450FFEEC", "paddd %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8, None), "66440FD4C1", "paddq %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5), + Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5, None), "66410FECE9", "paddsb %xmm9, %xmm5", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6, None), "660FEDF7", "paddsw %xmm7, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13, None), "66450FDCEC", "paddusb %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8, None), "66440FDDC1", "paddusw %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5), + Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5, None), "66410FE8E9", "psubsb %xmm9, %xmm5", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6, None), "660FE9F7", "psubsw %xmm7, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13, None), "66450FD8EC", "psubusb %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8, None), "66440FD9C1", "psubusw %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13, None), "66450FE0EC", "pavgb %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8, None), "66440FE3C1", "pavgw %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9, None), "66440FF8CD", "psubb %xmm5, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7), + Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7, None), "660FF9FE", "psubw %xmm6, %xmm7", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubd, 
RegMem::reg(xmm13), w_xmm12), + Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::reg(xmm13), w_xmm12, None), "66450FFAE5", "psubd %xmm13, %xmm12", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1, None), "66410FFBC8", "psubq %xmm8, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6, None), "66410F3840F7", "pmulld %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1, None), "66410FD5CE", "pmullw %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9, None), "66450FF4C8", "pmuludq %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6, None), "66410F383CF7", "pmaxsb %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6, None), "66410FEEF7", "pmaxsw %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6, None), "66410F383DF7", "pmaxsd %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1, None), "66410FDECE", "pmaxub %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1, None), "66410F383ECE", "pmaxuw %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1, None), "66410F383FCE", "pmaxud %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9, None), "66450F3838C8", "pminsb %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9, None), "66450FEAC8", "pminsw %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9, None), "66450F3839C8", "pminsd %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2, None), "660FDAD3", "pminub %xmm3, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2, None), "660F383AD3", "pminuw %xmm3, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminud, RegMem::reg(xmm3), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pminud, RegMem::reg(xmm3), w_xmm2, None), "660F383BD3", "pminud %xmm3, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2, None), "66410FEFD3", "pxor %xmm11, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2, None), "66410F3800D3", "pshufb %xmm11, 
%xmm2", )); @@ -3488,12 +3490,12 @@ fn test_x64_emit() { // ======================================================== // XmmRmRImm insns.push(( - Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false), + Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false, None), "660FC2CD02", "cmppd $2, %xmm5, %xmm1", )); insns.push(( - Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false), + Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false, None), "410FC2FF00", "cmpps $0, %xmm15, %xmm7", )); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 1fe0de694183..aac925db626f 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -212,6 +212,7 @@ pub enum Inst { op: SseOpcode, src: RegMem, dst: Writable, + srcloc: Option, }, /// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt, @@ -338,6 +339,7 @@ pub enum Inst { dst: Writable, imm: u8, is64: bool, + srcloc: Option, }, // ===================================== @@ -711,10 +713,20 @@ impl Inst { } } - pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable) -> Self { + pub(crate) fn xmm_rm_r( + op: SseOpcode, + src: RegMem, + dst: Writable, + srcloc: Option, + ) -> Self { src.assert_regclass_is(RegClass::V128); debug_assert!(dst.to_reg().get_class() == RegClass::V128); - Inst::XmmRmR { op, src, dst } + Inst::XmmRmR { + op, + src, + dst, + srcloc, + } } pub(crate) fn xmm_uninit_value(dst: Writable) -> Self { @@ -869,6 +881,7 @@ impl Inst { dst: Writable, imm: u8, is64: bool, + srcloc: Option, ) -> Inst { Inst::XmmRmRImm { op, @@ -876,6 +889,7 @@ impl Inst { dst, imm, is64, + srcloc, } } @@ -1233,16 +1247,26 @@ impl Inst { /// Choose which instruction to use for comparing two values for equality. pub(crate) fn equals(ty: Type, from: RegMem, to: Writable) -> Inst { match ty { - types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to), - types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to), - types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to), - types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to), - types::F32X4 => { - Inst::xmm_rm_r_imm(SseOpcode::Cmpps, from, to, FcmpImm::Equal.encode(), false) - } - types::F64X2 => { - Inst::xmm_rm_r_imm(SseOpcode::Cmppd, from, to, FcmpImm::Equal.encode(), false) - } + types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to, None), + types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to, None), + types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to, None), + types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to, None), + types::F32X4 => Inst::xmm_rm_r_imm( + SseOpcode::Cmpps, + from, + to, + FcmpImm::Equal.encode(), + false, + None, + ), + types::F64X2 => Inst::xmm_rm_r_imm( + SseOpcode::Cmppd, + from, + to, + FcmpImm::Equal.encode(), + false, + None, + ), _ => unimplemented!("unimplemented type for Inst::equals: {}", ty), } } @@ -1250,9 +1274,11 @@ impl Inst { /// Choose which instruction to use for computing a bitwise AND on two values. 
     pub(crate) fn and(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
         match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pand, from, to),
+            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to, None),
+            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to, None),
+            _ if ty.is_vector() && ty.bits() == 128 => {
+                Inst::xmm_rm_r(SseOpcode::Pand, from, to, None)
+            }
             _ => unimplemented!("unimplemented type for Inst::and: {}", ty),
         }
     }
@@ -1260,9 +1286,11 @@ impl Inst {
     /// Choose which instruction to use for computing a bitwise AND NOT on two values.
     pub(crate) fn and_not(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
         match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pandn, from, to),
+            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to, None),
+            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to, None),
+            _ if ty.is_vector() && ty.bits() == 128 => {
+                Inst::xmm_rm_r(SseOpcode::Pandn, from, to, None)
+            }
             _ => unimplemented!("unimplemented type for Inst::and_not: {}", ty),
         }
     }
@@ -1270,9 +1298,11 @@ impl Inst {
     /// Choose which instruction to use for computing a bitwise OR on two values.
     pub(crate) fn or(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
         match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Por, from, to),
+            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to, None),
+            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to, None),
+            _ if ty.is_vector() && ty.bits() == 128 => {
+                Inst::xmm_rm_r(SseOpcode::Por, from, to, None)
+            }
             _ => unimplemented!("unimplemented type for Inst::or: {}", ty),
         }
     }
@@ -1280,9 +1310,11 @@ impl Inst {
     /// Choose which instruction to use for computing a bitwise XOR on two values.
     pub(crate) fn xor(ty: Type, from: RegMem, to: Writable<Reg>) -> Inst {
         match ty {
-            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to),
-            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to),
-            _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pxor, from, to),
+            types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to, None),
+            types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to, None),
+            _ if ty.is_vector() && ty.bits() == 128 => {
+                Inst::xmm_rm_r(SseOpcode::Pxor, from, to, None)
+            }
             _ => unimplemented!("unimplemented type for Inst::xor: {}", ty),
         }
     }
@@ -1429,7 +1461,7 @@ impl PrettyPrint for Inst {
                 dst.show_rru(mb_rru),
             ),
 
-            Inst::XmmRmR { op, src, dst } => format!(
+            Inst::XmmRmR { op, src, dst, .. } => format!(
                 "{} {}, {}",
                 ljustify(op.to_string()),
                 src.show_rru_sized(mb_rru, 8),
@@ -1459,7 +1491,7 @@ impl PrettyPrint for Inst {
                 show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8),
             ),
 
-            Inst::XmmRmRImm { op, src, dst, imm, is64 } => format!(
+            Inst::XmmRmRImm { op, src, dst, imm, is64, .. } => format!(
                 "{} ${}, {}, {}",
                 ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })),
                 imm,
@@ -2595,6 +2627,7 @@ impl MachInst for Inst {
                     SseOpcode::Xorps,
                     RegMem::reg(to_reg.to_reg()),
                     to_reg,
+                    None,
                 ));
             } else {
                 let tmp = alloc_tmp(RegClass::I64, types::I32);
@@ -2613,6 +2646,7 @@ impl MachInst for Inst {
                     SseOpcode::Xorpd,
                     RegMem::reg(to_reg.to_reg()),
                     to_reg,
+                    None,
                 ));
             } else {
                 let tmp = alloc_tmp(RegClass::I64, types::I64);
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 108072b97cce..614589d1603d 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -3,7 +3,7 @@
 use crate::data_value::DataValue;
 use crate::ir::{
     condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName,
-    Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type,
+    Inst as IRInst, InstructionData, LibCall, Opcode, Signature, SourceLoc, Type,
 };
 use crate::isa::x64::abi::*;
 use crate::isa::x64::inst::args::*;
@@ -227,6 +227,7 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
     dst: Writable<Reg>,
     lane: u8,
     ty: Type,
+    srcloc: Option<SourceLoc>,
 ) {
     if !ty.is_float() {
         let (sse_op, is64) = match ty.lane_bits() {
@@ -236,13 +237,13 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
             64 => (SseOpcode::Pinsrd, true),
             _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()),
         };
-        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64));
+        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64, srcloc));
     } else if ty == types::F32 {
         let sse_op = SseOpcode::Insertps;
         // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane
        // shifted into bits 5:6).
         let lane = 0b00_00_00_00 | lane << 4;
-        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false));
+        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false, srcloc));
     } else if ty == types::F64 {
         let sse_op = match lane {
             // Move the lowest quadword in replacement to vector without changing
@@ -256,7 +257,7 @@ fn emit_insert_lane<C: LowerCtx<I = Inst>>(
         // Here we use the `xmm_rm_r` encoding because it correctly tells the register
         // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other
         // encoding formats like `xmm_unary_rm_r` treat it as a `def`.
-        ctx.emit(Inst::xmm_rm_r(sse_op, src, dst));
+        ctx.emit(Inst::xmm_rm_r(sse_op, src, dst, srcloc));
     } else {
         panic!("unable to emit insertlane for type: {}", ty)
     }
@@ -694,6 +695,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     SseOpcode::Pmuludq,
                     RegMem::reg(lhs.clone()),
                     rhs_1,
+                    None,
                 ));
 
                 // B' = B
@@ -707,7 +709,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     RegMemImm::imm(32),
                     lhs_1,
                 ));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pmuludq,
+                    RegMem::reg(rhs),
+                    lhs_1,
+                    None,
+                ));
 
                 // B' = B' + A'
                 // B' = B' << 32
@@ -715,6 +722,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     SseOpcode::Paddq,
                     RegMem::reg(rhs_1.to_reg()),
                     lhs_1,
+                    None,
                 ));
                 ctx.emit(Inst::xmm_rmi_reg(
                     SseOpcode::Psllq,
@@ -731,11 +739,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     SseOpcode::Pmuludq,
                     RegMem::reg(lhs.clone()),
                     rhs_1,
+                    None,
                 ));
                 ctx.emit(Inst::xmm_rm_r(
                     SseOpcode::Paddq,
                     RegMem::reg(lhs_1.to_reg()),
                     rhs_1,
+                    None,
                 ));
                 ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty));
                 return Ok(());
@@ -770,7 +780,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
                 // Move the `lhs` to the same register as `dst`.
                 ctx.emit(Inst::gen_move(dst, lhs, ty));
-                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
             } else {
                 let is_64 = ty == types::I64;
                 let alu_op = match op {
@@ -828,7 +838,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 // Note the flipping of operands: the `rhs` operand is used as the destination instead
                 // of the `lhs` as in the other bit operations above (e.g. `band`).
                 ctx.emit(Inst::gen_move(dst, rhs, ty));
-                ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst));
+                ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst, None));
             }
 
         Opcode::Iabs => {
@@ -884,7 +894,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
                 // Move the `lhs` to the same register as `dst`.
                 ctx.emit(Inst::gen_move(dst, lhs, ty));
-                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
             } else {
                 panic!("Unsupported type for {} instruction: {}", op, ty);
             }
@@ -1007,8 +1017,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     SseOpcode::Pxor,
                     RegMem::reg(tmp.to_reg()),
                     tmp,
+                    None,
                 ));
-                ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp));
+                ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp, None));
                 ctx.emit(Inst::xmm_unary_rm_r(
                     SseOpcode::Movapd,
                     RegMem::reg(tmp.to_reg()),
@@ -1561,34 +1572,44 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 };
 
                 match condcode {
-                    IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)),
+                    IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None)),
                     IntCC::NotEqual => {
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None));
                         // Emit all 1s into the `tmp` register.
                         let tmp = ctx.alloc_tmp(RegClass::V128, ty);
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp, None));
                         // Invert the result of the `PCMPEQ*`.
-                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
+                        ctx.emit(Inst::xmm_rm_r(
+                            SseOpcode::Pxor,
+                            RegMem::from(tmp),
+                            dst,
+                            None,
+                        ));
                     }
                     IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
-                        ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
+                        ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst, None))
                     }
                     IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => {
-                        ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
+                        ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst, None));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None))
                     }
                     IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
-                        ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
+                        ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst, None));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None));
                         // Emit all 1s into the `tmp` register.
                         let tmp = ctx.alloc_tmp(RegClass::V128, ty);
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp, None));
                         // Invert the result of the `PCMPEQ*`.
-                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
+                        ctx.emit(Inst::xmm_rm_r(
+                            SseOpcode::Pxor,
+                            RegMem::from(tmp),
+                            dst,
+                            None,
+                        ));
                     }
                     IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
-                        ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst));
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
+                        ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst, None));
+                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None))
                    }
                     _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
                 }
@@ -1686,7 +1707,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ctx.emit(Inst::gen_move(dst, lhs, input_ty));
 
                     // Emit the comparison.
-                    ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false));
+                    ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false, None));
                 }
             }
@@ -1899,7 +1920,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         ty
                     ),
                 };
-                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst));
+                ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None));
             }
 
         Opcode::Fmin | Opcode::Fmax => {
@@ -1988,15 +2009,15 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
 
                     // Perform min in reverse direction
-                    ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1, None));
 
                     // Perform min in original direction
-                    ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst));
+                    ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst, None));
 
                     // X64 handles propagation of -0's and Nans differently between left and right
                     // operands. After doing the min in both directions, this OR will
                     // guarantee capture of -0's and Nan in our tmp register
-                    ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1, None));
 
                     // Compare unordered to create mask for lanes containing NaNs and then use
                     // that mask to saturate the NaN containing lanes in the tmp register with 1s.
@@ -2009,8 +2030,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         dst,
                         cond.encode(),
                         false,
+                        None,
                     ));
-                    ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(
+                        or_op,
+                        RegMem::reg(dst.to_reg()),
+                        tmp_xmm1,
+                        None,
+                    ));
 
                     // The dst register holds a mask for lanes containing NaNs.
                     // We take that mask and shift in preparation for creating a different mask
@@ -2022,7 +2049,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
                     // Finally we do a nand with the tmp register to produce the final results
                     // in the dst.
-                    ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+                    ctx.emit(Inst::xmm_rm_r(
+                        andn_op,
+                        RegMem::reg(tmp_xmm1.to_reg()),
+                        dst,
+                        None,
+                    ));
                 } else {
                     let (
                         mov_op,
@@ -2065,23 +2097,43 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None));
 
                     // Perform max in reverse direction.
-                    ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(
+                        max_op,
+                        RegMem::reg(dst.to_reg()),
+                        tmp_xmm1,
+                        None,
+                    ));
 
                     // Perform max in original direction.
-                    ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst));
+                    ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst, None));
 
                     // Get the difference between the two results and store in tmp.
                     // Max uses a different approach than min to account for potential
                     // discrepancies with plus/minus 0.
-                    ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+                    ctx.emit(Inst::xmm_rm_r(
+                        xor_op,
+                        RegMem::reg(tmp_xmm1.to_reg()),
+                        dst,
+                        None,
+                    ));
 
                     // X64 handles propagation of -0's and Nans differently between left and right
                     // operands. After doing the max in both directions, this OR will
                     // guarantee capture of 0's and Nan in our tmp register.
-                    ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(
+                        or_op,
+                        RegMem::reg(dst.to_reg()),
+                        tmp_xmm1,
+                        None,
+                    ));
 
                     // Capture NaNs and sign discrepancies.
-                    ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1));
+                    ctx.emit(Inst::xmm_rm_r(
+                        sub_op,
+                        RegMem::reg(dst.to_reg()),
+                        tmp_xmm1,
+                        None,
+                    ));
 
                     // Compare unordered to create mask for lanes containing NaNs and then use
                     // that mask to saturate the NaN containing lanes in the tmp register with 1s.
@@ -2092,6 +2144,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         dst,
                         cond.encode(),
                         false,
+                        None,
                     ));
 
                     // The dst register holds a mask for lanes containing NaNs.
@@ -2104,7 +2157,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
                     // Finally we do a nand with the tmp register to produce the final results
                     // in the dst.
-                    ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst));
+                    ctx.emit(Inst::xmm_rm_r(
+                        andn_op,
+                        RegMem::reg(tmp_xmm1.to_reg()),
+                        dst,
+                        None,
+                    ));
                 }
             }
         }
@@ -2327,7 +2385,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ctx.emit(inst);
                 }
 
-                ctx.emit(Inst::xmm_rm_r(opcode, src, dst));
+                ctx.emit(Inst::xmm_rm_r(opcode, src, dst, None));
             } else {
                 // Eventually vector constants should be available in `gen_constant` and this block
                 // can be merged with the one above (TODO).
@@ -2348,6 +2406,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         tmp,
                         cond.encode(),
                         false,
+                        None,
                     );
                     ctx.emit(cmpps);
 
@@ -2367,7 +2426,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     ctx.emit(shift);
 
                     // Apply shifted mask (XOR or AND).
-                    let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst);
+                    let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst, None);
                     ctx.emit(mask);
                 } else {
                     panic!("unexpected type {:?} for Fabs", output_ty);
                }
@@ -2426,14 +2485,20 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     dst,
                     None,
                 ));
-                ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst));
+                ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst, None));
                 ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2, None));
                 ctx.emit(Inst::xmm_rm_r(
                     and_op,
                     RegMem::reg(tmp_xmm1.to_reg()),
                     tmp_xmm2,
+                    None,
+                ));
+                ctx.emit(Inst::xmm_rm_r(
+                    or_op,
+                    RegMem::reg(tmp_xmm2.to_reg()),
+                    dst,
+                    None,
                 ));
-                ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst));
             }
 
         Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => {
@@ -3154,7 +3219,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 // After loading the constructed mask in a temporary register, we use this to
                 // shuffle the `dst` register (remember that, in this case, it is the same as
                 // `src` so we disregard this register).
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pshufb,
+                    RegMem::from(tmp),
+                    dst,
+                    None,
+                ));
             } else {
                 // If `lhs` and `rhs` are different, we must shuffle each separately and then OR
                 // them together. This is necessary due to PSHUFB semantics. As in the case above,
@@ -3166,7 +3236,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect();
                 let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
                 ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp1, ty));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pshufb,
+                    RegMem::from(tmp1),
+                    tmp0,
+                    None,
+                ));
 
                 // PSHUFB the second argument, placing zeroes for unused lanes.
                 let constructed_mask = mask
@@ -3176,11 +3251,21 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     .collect();
                 let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16);
                 ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp2, ty));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Pshufb,
+                    RegMem::from(tmp2),
+                    dst,
+                    None,
+                ));
 
                 // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers
                 // is not important).
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst));
+                ctx.emit(Inst::xmm_rm_r(
+                    SseOpcode::Orps,
+                    RegMem::from(tmp0),
+                    dst,
+                    None,
+                ));
 
                 // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB
             }
@@ -3214,6 +3299,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                     SseOpcode::Paddusb,
                     RegMem::from(zero_mask),
                     swizzle_mask,
+                    None,
                 ));
 
                 // Shuffle `dst` using the fixed-up `swizzle_mask`.
                 ctx.emit(Inst::xmm_rm_r(
                     SseOpcode::Pshufb,
                     RegMem::from(swizzle_mask),
                     dst,
+                    None,
                 ));
             }
 
@@ -3240,7 +3327,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 debug_assert!(lane < ty.lane_count() as u8);
 
                 ctx.emit(Inst::gen_move(dst, in_vec, ty));
-                emit_insert_lane(ctx, src, dst, lane, ty.lane_type());
+                emit_insert_lane(ctx, src, dst, lane, ty.lane_type(), None);
             }
 
         Opcode::Extractlane => {
@@ -3266,7 +3353,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()),
                     };
                     let src = RegMem::reg(src);
-                    ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit));
+                    ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit, None));
                 } else {
                     if lane == 0 {
                         // Remove the extractlane instruction, leaving the float where it is. The upper
@@ -3288,35 +3375,57 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                             _ => unreachable!(),
                         };
                         let src = RegMem::reg(src);
-                        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false));
+                        ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false, None));
                     }
                }
            }
 
-        Opcode::Splat => {
+        Opcode::Splat | Opcode::LoadSplat => {
             let ty = ty.unwrap();
             assert_eq!(ty.bits(), 128);
             let src_ty = ctx.input_ty(insn, 0);
             assert!(src_ty.bits() < 128);
-            let src = input_to_reg_mem(ctx, inputs[0]);
+
+            let (src, srcloc) = match op {
+                Opcode::Splat => (input_to_reg_mem(ctx, inputs[0]), None),
+                Opcode::LoadSplat => {
+                    let offset = ctx.data(insn).load_store_offset().unwrap();
+                    let amode = lower_to_amode(ctx, inputs[0], offset);
+                    (RegMem::mem(amode), Some(ctx.srcloc(insn)))
+                }
+                _ => unreachable!(),
+            };
             let dst = get_output_reg(ctx, outputs[0]);
 
             // We know that splat will overwrite all of the lanes of `dst` but it takes several
             // instructions to do so. Because of the multiple instructions, there is no good way to
             // declare `dst` a `def` except with the following pseudo-instruction.
             ctx.emit(Inst::xmm_uninit_value(dst));
+
+            // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST*
+            // and VPBROADCAST*.
             match ty.lane_bits() {
                 8 => {
-                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
+                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc);
                     // Initialize a register with all 0s.
                     let tmp = ctx.alloc_tmp(RegClass::V128, ty);
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Pxor,
+                        RegMem::from(tmp),
+                        tmp,
+                        srcloc,
+                    ));
                     // Shuffle the lowest byte lane to all other lanes.
-                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst))
+                    ctx.emit(Inst::xmm_rm_r(
+                        SseOpcode::Pshufb,
+                        RegMem::from(tmp),
+                        dst,
+                        srcloc,
+                    ))
                 }
                 16 => {
-                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
-                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
+                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type(), srcloc);
+                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type(), srcloc);
                     // Shuffle the lowest two lanes to all other lanes.
                     ctx.emit(Inst::xmm_rm_r_imm(
                         SseOpcode::Pshufd,
@@ -3324,10 +3433,11 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         dst,
                         0,
                         false,
+                        srcloc,
                     ))
                 }
                 32 => {
-                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type());
+                    emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc);
                     // Shuffle the lowest lane to all other lanes.
                     ctx.emit(Inst::xmm_rm_r_imm(
                         SseOpcode::Pshufd,
@@ -3335,11 +3445,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                         dst,
                         0,
                         false,
+                        srcloc,
                     ))
                 }
                 64 => {
-                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type());
-                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type());
+                    emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type(), srcloc);
+                    emit_insert_lane(ctx, src, dst, 1, ty.lane_type(), srcloc);
                 }
                 _ => panic!("Invalid type to splat: {}", ty),
             }
@@ -3373,9 +3484,14 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
             // Initialize a register with all 0s.
             let tmp = ctx.alloc_tmp(RegClass::V128, ty);
-            ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
+            ctx.emit(Inst::xmm_rm_r(
+                SseOpcode::Pxor,
+                RegMem::from(tmp),
+                tmp,
+                None,
+            ));
             // Compare to see what lanes are filled with all 1s.
-            ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp));
+            ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp, None));
             // Set the ZF if the result is all zeroes.
             ctx.emit(Inst::xmm_cmp_rm_r(
                 SseOpcode::Ptest,
diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
index 72890cffd9f0..976f1581e3cf 100644
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -1892,3 +1892,31 @@ fn expand_tls_value(
         unreachable!();
     }
 }
+
+fn expand_load_splat(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph,
+    _isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+
+    pos.use_srcloc(inst);
+
+    let (ptr, offset, flags) = match pos.func.dfg[inst] {
+        ir::InstructionData::Load {
+            opcode: ir::Opcode::LoadSplat,
+            arg,
+            offset,
+            flags,
+        } => (arg, offset, flags),
+        _ => panic!(
+            "Expected load_splat: {}",
+            pos.func.dfg.display_inst(inst, None)
+        ),
+    };
+    let ty = pos.func.dfg.ctrl_typevar(inst);
+    let load = pos.ins().load(ty.lane_type(), flags, ptr, offset);
+
+    pos.func.dfg.replace(inst).splat(ty, load);
+}
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 7c827802ba4a..ef1804cf12b5 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1380,19 +1380,17 @@ pub fn translate_operator(
         | Operator::V128Load16Splat { memarg }
         | Operator::V128Load32Splat { memarg }
         | Operator::V128Load64Splat { memarg } => {
-            // TODO: For spec compliance, this is initially implemented as a combination of `load +
-            // splat` but could be implemented eventually as a single instruction (`load_splat`).
-            // See https://github.com/bytecodealliance/wasmtime/issues/1175.
-            translate_load(
+            let opcode = ir::Opcode::LoadSplat;
+            let result_ty = type_of(op);
+            let (flags, base, offset) = prepare_load(
                 memarg,
-                ir::Opcode::Load,
-                type_of(op).lane_type(),
+                mem_op_size(opcode, result_ty.lane_type()),
                 builder,
                 state,
                 environ,
             )?;
-            let splatted = builder.ins().splat(type_of(op), state.pop1());
-            state.push1(splatted)
+            let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
+            state.push1(dfg.first_result(load))
         }
         Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
             let vector = pop1_with_bitcast(state, type_of(op), builder);
@@ -2040,7 +2038,7 @@ fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 {
         ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1,
         ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2,
         ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4,
-        ir::Opcode::Store | ir::Opcode::Load => ty.bytes(),
+        ir::Opcode::Store | ir::Opcode::Load | ir::Opcode::LoadSplat => ty.bytes(),
         _ => panic!("unknown size of mem op for {:?}", opcode),
     }
 }
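
Note: as a rough illustration of the semantics this patch adds (the value numbers and
types below are illustrative, not taken from the patch's test suite), a CLIF instruction
such as

    v1 = load_splat.i32x4 v0+8

behaves like the pair that `expand_load_splat` produces when legalizing on the old x86
backend:

    v2 = load.i32 v0+8
    v1 = splat.i32x4 v2

The new backends instead select a single machine sequence: AArch64 lowers it to `ld1r`
via the new `VecLoadReplicate` instruction, while x64 reuses the existing `splat`
lowering with a memory operand, registering a heap-out-of-bounds trap location whenever
the memory flags permit trapping.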