Introduce the Cranelift IR instruction LoadSplat #2278

Merged: 3 commits, Oct 28, 2020
cranelift/codegen/meta/src/isa/x86/legalize.rs (2 additions, 0 deletions)
@@ -396,6 +396,7 @@ fn define_simd(
let insertlane = insts.by_name("insertlane");
let ishl = insts.by_name("ishl");
let ishl_imm = insts.by_name("ishl_imm");
+ let load_splat = insts.by_name("load_splat");
let raw_bitcast = insts.by_name("raw_bitcast");
let scalar_to_vector = insts.by_name("scalar_to_vector");
let splat = insts.by_name("splat");
@@ -820,6 +821,7 @@ fn define_simd(
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
narrow.custom_legalize(fmin, "expand_minmax_vector");
narrow.custom_legalize(fmax, "expand_minmax_vector");
+ narrow.custom_legalize(load_splat, "expand_load_splat");

narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
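The `expand_load_splat` helper registered here is implemented outside this diff. A plausible sketch of what it does, assuming the signature shared by the other `expand_*` hooks in the old x86 legalizer (the names and signature here are assumptions, not part of this patch):

```rust
// Sketch only: legalize load_splat into its defining pair, load + splat.
fn expand_load_splat(
    inst: ir::Inst,
    func: &mut ir::Function,
    _cfg: &mut ControlFlowGraph,
    _isa: &dyn TargetIsa,
) {
    let mut pos = FuncCursor::new(func).at_inst(inst);
    pos.use_srcloc(inst);

    let (ptr, offset, flags, ty) = match pos.func.dfg[inst] {
        ir::InstructionData::Load {
            opcode: ir::Opcode::LoadSplat,
            arg,
            offset,
            flags,
        } => (arg, offset, flags, pos.func.dfg.ctrl_typevar(inst)),
        _ => unreachable!(),
    };

    // Load one lane, then broadcast it; replace(inst) reuses the old result value.
    let scalar = pos.ins().load(ty.lane_type(), flags, ptr, offset);
    pos.func.dfg.replace(inst).splat(ty, scalar);
}
```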
cranelift/codegen/meta/src/shared/instructions.rs (19 additions, 0 deletions)
@@ -4409,5 +4409,24 @@ pub(crate) fn define(
.other_side_effects(true),
);

+ let Offset = &Operand::new("Offset", &imm.offset32).with_doc("Byte offset from base address");
+ let a = &Operand::new("a", TxN);
+
+ ig.push(
+ Inst::new(
+ "load_splat",
+ r#"
+ Load an element from memory at ``p + Offset`` and return a vector
+ whose lanes are all set to that element.
+
+ This is equivalent to ``load`` followed by ``splat``.
+ "#,
+ &formats.load,
+ )
+ .operands_in(vec![MemFlags, p, Offset])
+ .operands_out(vec![a])
+ .can_load(true),
+ );

ig.build()
}
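Read as CLIF, the equivalence described in the doc comment looks like this (syntax sketched from the definition above; the exact printed form of `load_splat` may differ):

```
;; two instructions today:
v1 = load.i32 v0+8
v2 = splat.i32x4 v1

;; one instruction with this change, same memory flags and offset:
v2 = load_splat.i32x4 v0+8
```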
cranelift/codegen/src/isa/aarch64/inst/args.rs (15 additions, 0 deletions)
@@ -680,4 +680,19 @@ impl VectorSize {
_ => *self,
}
}

+ /// Return the encoding bits that are used by some SIMD instructions
+ /// for a particular operand size.
+ pub fn enc_size(&self) -> (u32, u32) {
+ let q = self.is_128bits() as u32;
+ let size = match self.lane_size() {
+ ScalarSize::Size8 => 0b00,
+ ScalarSize::Size16 => 0b01,
+ ScalarSize::Size32 => 0b10,
+ ScalarSize::Size64 => 0b11,
+ _ => unreachable!(),
+ };
+
+ (q, size)
+ }
}
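A quick sanity check of the new helper against the match above; Q distinguishes 64-bit from 128-bit vectors and `size` encodes the lane width:

```rust
assert_eq!(VectorSize::Size8x8.enc_size(), (0, 0b00));  // 64-bit vector, byte lanes
assert_eq!(VectorSize::Size32x4.enc_size(), (1, 0b10)); // 128-bit vector, 32-bit lanes
assert_eq!(VectorSize::Size64x2.enc_size(), (1, 0b11)); // 128-bit vector, 64-bit lanes
```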
cranelift/codegen/src/isa/aarch64/inst/emit.rs (87 additions, 58 deletions)
@@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
(op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
}

+ fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable<Reg>) -> u32 {
+ debug_assert_eq!(q & 0b1, q);
+ debug_assert_eq!(size & 0b11, size);
+ 0b0_0_0011010_10_00000_110_0_00_00000_00000
+ | q << 30
+ | size << 10
+ | machreg_to_gpr(rn) << 5
+ | machreg_to_vec(rt.to_reg())
+ }

fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
(top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
}
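Cross-checking `enc_ldst_vec` against the first `ld1r` emit test added below (expected bytes `1FCC404D`, printed little-endian):

```rust
// ld1r { v31.2d }, [x0]: q = 1 (128-bit), size = 0b11 (64-bit lanes),
// rn = x0 (GPR 0), rt = v31 (vector register 31).
let base = 0b0_0_0011010_10_00000_110_0_00_00000_00000u32; // 0x0D40_C000
let word = base | (1 << 30) | (0b11 << 10) | (0 << 5) | 31;
assert_eq!(word, 0x4D40_CC1F); // stored as bytes 1F CC 40 4D
```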
@@ -1380,14 +1390,7 @@ impl MachInstEmit for Inst {
sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra));
}
&Inst::VecMisc { op, rd, rn, size } => {
- let enc_size = match size.lane_size() {
- ScalarSize::Size8 => 0b00,
- ScalarSize::Size16 => 0b01,
- ScalarSize::Size32 => 0b10,
- ScalarSize::Size64 => 0b11,
- _ => unreachable!(),
- };
- let q = if size.is_128bits() { 1 } else { 0 };
+ let (q, enc_size) = size.enc_size();
let (u, bits_12_16, size) = match op {
VecMisc2::Not => (0b1, 0b00101, 0b00),
VecMisc2::Neg => (0b1, 0b01011, enc_size),
@@ -1756,13 +1759,7 @@ impl MachInstEmit for Inst {
alu_op,
size,
} => {
- let enc_size = match size.lane_size() {
- ScalarSize::Size8 => 0b00,
- ScalarSize::Size16 => 0b01,
- ScalarSize::Size32 => 0b10,
- ScalarSize::Size64 => 0b11,
- _ => unreachable!(),
- };
+ let (q, enc_size) = size.enc_size();
let is_float = match alu_op {
VecALUOp::Fcmeq
| VecALUOp::Fcmgt
@@ -1776,65 +1773,81 @@ impl MachInstEmit for Inst {
_ => false,
};
let enc_float_size = match (is_float, size) {
+ (true, VectorSize::Size32x2) => 0b0,
(true, VectorSize::Size32x4) => 0b0,
(true, VectorSize::Size64x2) => 0b1,
(true, _) => unimplemented!(),
_ => 0,
};

let (top11, bit15_10) = match alu_op {
- VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011),
- VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011),
- VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011),
- VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011),
- VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011),
- VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111),
- VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101),
- VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101),
- VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111),
- VecALUOp::Fcmeq => (0b010_01110_00_1, 0b111001),
- VecALUOp::Fcmgt => (0b011_01110_10_1, 0b111001),
- VecALUOp::Fcmge => (0b011_01110_00_1, 0b111001),
+ VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
+ VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011),
+ VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011),
+ VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011),
+ VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011),
+ VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111),
+ VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101),
+ VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101),
+ VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111),
+ VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001),
+ VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001),
+ VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001),
// The following logical instructions operate on bytes, so are not encoded differently
// for the different vector types.
- VecALUOp::And => (0b010_01110_00_1, 0b000111),
- VecALUOp::Bic => (0b010_01110_01_1, 0b000111),
- VecALUOp::Orr => (0b010_01110_10_1, 0b000111),
- VecALUOp::Eor => (0b011_01110_00_1, 0b000111),
- VecALUOp::Bsl => (0b011_01110_01_1, 0b000111),
- VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001),
- VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001),
- VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001),
+ VecALUOp::And => (0b000_01110_00_1, 0b000111),
+ VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
+ VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
+ VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
+ VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
+ VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001),
+ VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001),
+ VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001),
VecALUOp::Mul => {
debug_assert_ne!(size, VectorSize::Size64x2);
- (0b010_01110_00_1 | enc_size << 1, 0b100111)
+ (0b000_01110_00_1 | enc_size << 1, 0b100111)
}
- VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001),
- VecALUOp::Ushl => (0b011_01110_00_1 | enc_size << 1, 0b010001),
- VecALUOp::Umin => (0b011_01110_00_1 | enc_size << 1, 0b011011),
- VecALUOp::Smin => (0b010_01110_00_1 | enc_size << 1, 0b011011),
- VecALUOp::Umax => (0b011_01110_00_1 | enc_size << 1, 0b011001),
- VecALUOp::Smax => (0b010_01110_00_1 | enc_size << 1, 0b011001),
- VecALUOp::Urhadd => (0b011_01110_00_1 | enc_size << 1, 0b000101),
- VecALUOp::Fadd => (0b010_01110_00_1, 0b110101),
- VecALUOp::Fsub => (0b010_01110_10_1, 0b110101),
- VecALUOp::Fdiv => (0b011_01110_00_1, 0b111111),
- VecALUOp::Fmax => (0b010_01110_00_1, 0b111101),
- VecALUOp::Fmin => (0b010_01110_10_1, 0b111101),
- VecALUOp::Fmul => (0b011_01110_00_1, 0b110111),
- VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111),
+ VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001),
+ VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001),
+ VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011),
+ VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011),
+ VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001),
+ VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001),
+ VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101),
+ VecALUOp::Fadd => (0b000_01110_00_1, 0b110101),
+ VecALUOp::Fsub => (0b000_01110_10_1, 0b110101),
+ VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111),
+ VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
+ VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
+ VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
+ VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
VecALUOp::Umlal => {
debug_assert!(!size.is_128bits());
(0b001_01110_00_1 | enc_size << 1, 0b100000)
}
};
let top11 = if is_float {
- top11 | enc_float_size << 1
+ top11 | (q << 9) | enc_float_size << 1
} else {
- top11
+ top11 | (q << 9)
};
sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd));
}
+ &Inst::VecLoadReplicate {
+ rd,
+ rn,
+ size,
+ srcloc,
+ } => {
+ let (q, size) = size.enc_size();
+
+ if let Some(srcloc) = srcloc {
+ // Register the offset at which the actual load instruction starts.
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ }
+
+ sink.put4(enc_ldst_vec(q, size, rn, rd));
+ }
&Inst::MovToNZCV { rn } => {
sink.put4(0xd51b4200 | machreg_to_gpr(rn));
}
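Why `q << 9` works in the vector ALU arm above: `top11` holds bits 31 down to 21 of the instruction word (assuming `enc_vec_rrr` shifts it left by 21, which the 11-bit prefixes suggest), so bit 9 of `top11` lands at bit 30, the SIMD Q bit the old prefixes baked in:

```rust
// New style: Q stripped from the prefix and OR'd in per operand size.
let top11 = 0b000_01110_00_1u32; // e.g. the new `add` prefix, Q clear
let q = 1u32;                    // 128-bit vector
assert_eq!(top11 | (q << 9), 0b010_01110_00_1); // the old 128-bit-only prefix
```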
@@ -2119,9 +2132,12 @@ impl MachInstEmit for Inst {
inst.emit(sink, emit_info, state);
}

- let (reg, offset) = match mem {
- AMode::Unscaled(r, simm9) => (r, simm9.value()),
- AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32),
+ let (reg, index_reg, offset) = match mem {
+ AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0),
+ AMode::Unscaled(r, simm9) => (r, None, simm9.value()),
+ AMode::UnsignedOffset(r, uimm12scaled) => {
+ (r, None, uimm12scaled.value() as i32)
+ }
_ => panic!("Unsupported case for LoadAddr: {:?}", mem),
};
let abs_offset = if offset < 0 {
@@ -2135,9 +2151,22 @@ impl MachInstEmit for Inst {
ALUOp::Add64
};

- if offset == 0 {
- let mov = Inst::mov(rd, reg);
- mov.emit(sink, emit_info, state);
+ if let Some((idx, extendop)) = index_reg {
+ let add = Inst::AluRRRExtend {
+ alu_op: ALUOp::Add64,
+ rd,
+ rn: reg,
+ rm: idx,
+ extendop,
+ };
+
+ add.emit(sink, emit_info, state);
+ } else if offset == 0 {
+ if reg != rd.to_reg() {
+ let mov = Inst::mov(rd, reg);
+
+ mov.emit(sink, emit_info, state);
+ }
} else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) {
let add = Inst::AluRRImm12 {
alu_op,
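With the new `RegExtended` arm, `LoadAddr` can materialize extended-register address modes instead of reaching the `panic!` arm, and the `offset == 0` path now skips the `mov` when the address is already in the destination register. A hypothetical instance (register numbers invented for illustration):

```rust
// Materialize base + zero-extended 32-bit index into x10.
let inst = Inst::LoadAddr {
    rd: writable_xreg(10),
    mem: AMode::RegExtended(xreg(1), xreg(2), ExtendOp::UXTW),
};
// Expected emission, a single AluRRRExtend: add x10, x1, w2, uxtw
```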
cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs (28 additions, 6 deletions)
@@ -2533,10 +2533,10 @@ fn test_aarch64_binemit() {
rd: writable_vreg(28),
rn: vreg(12),
rm: vreg(4),
- size: VectorSize::Size32x4,
+ size: VectorSize::Size32x2,
},
- "9CE5244E",
- "fcmeq v28.4s, v12.4s, v4.4s",
+ "9CE5240E",
+ "fcmeq v28.2s, v12.2s, v4.2s",
));

insns.push((
@@ -2965,10 +2965,10 @@ fn test_aarch64_binemit() {
rd: writable_vreg(6),
rn: vreg(9),
rm: vreg(8),
- size: VectorSize::Size8x16,
+ size: VectorSize::Size8x8,
},
- "2665286E",
- "umax v6.16b, v9.16b, v8.16b",
+ "2665282E",
+ "umax v6.8b, v9.8b, v8.8b",
));

insns.push((
@@ -3507,6 +3507,28 @@ fn test_aarch64_binemit() {
"tbx v3.16b, { v11.16b, v12.16b }, v19.16b",
));

+ insns.push((
+ Inst::VecLoadReplicate {
+ rd: writable_vreg(31),
+ rn: xreg(0),
+ srcloc: None,
+ size: VectorSize::Size64x2,
+ },
+ "1FCC404D",
+ "ld1r { v31.2d }, [x0]",
+ ));
+
+ insns.push((
+ Inst::VecLoadReplicate {
+ rd: writable_vreg(0),
+ rn: xreg(25),
+ srcloc: None,
+ size: VectorSize::Size8x8,
+ },
+ "20C3400D",
+ "ld1r { v0.8b }, [x25]",
+ ));

insns.push((
Inst::Extend {
rd: writable_xreg(1),
cranelift/codegen/src/isa/aarch64/inst/mod.rs (28 additions, 2 deletions)
@@ -975,6 +975,14 @@ pub enum Inst {
is_extension: bool,
},

+ /// Load an element and replicate to all lanes of a vector.
+ VecLoadReplicate {
+ rd: Writable<Reg>,
+ rn: Reg,
+ size: VectorSize,
+ srcloc: Option<SourceLoc>,
+ },

/// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn).
MovToNZCV {
rn: Reg,
@@ -1609,7 +1617,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
}
}

+ &Inst::VecLoadReplicate { rd, rn, .. } => {
+ collector.add_def(rd);
+ collector.add_use(rn);
+ }
&Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => {
collector.add_use(rn);
collector.add_use(rm);
@@ -1762,8 +1773,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
&Inst::LoadExtName { rd, .. } => {
collector.add_def(rd);
}
- &Inst::LoadAddr { rd, mem: _ } => {
+ &Inst::LoadAddr { rd, ref mem } => {
collector.add_def(rd);
+ memarg_regs(mem, collector);
}
&Inst::VirtualSPOffsetAdj { .. } => {}
&Inst::EmitIsland { .. } => {}
@@ -2189,6 +2201,14 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_def(mapper, rd);
}
}
+ &mut Inst::VecLoadReplicate {
+ ref mut rd,
+ ref mut rn,
+ ..
+ } => {
+ map_def(mapper, rd);
+ map_use(mapper, rn);
+ }
&mut Inst::FpuCmp32 {
ref mut rn,
ref mut rm,
@@ -3412,6 +3432,12 @@ impl Inst {
let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16);
format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm)
}
+ &Inst::VecLoadReplicate { rd, rn, size, .. } => {
+ let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
+ let rn = rn.show_rru(mb_rru);
+
+ format!("ld1r {{ {} }}, [{}]", rd, rn)
+ }
&Inst::MovToNZCV { rn } => {
let rn = rn.show_rru(mb_rru);
format!("msr nzcv, {}", rn)