CL/aarch64: implement the wasm SIMD v128.load{32,64}_zero instructions #2355

Merged
9 changes: 3 additions & 6 deletions cranelift/codegen/meta/src/shared/instructions.rs
@@ -3798,12 +3798,9 @@ pub(crate) fn define(
Inst::new(
"scalar_to_vector",
r#"
Scalar To Vector -- move a value out of a scalar register and into a vector register; the
scalar will be moved to the lowest-order bits of the vector register. Note that this
instruction is intended as a low-level legalization instruction and frontends should prefer
insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some
types (e.g. integers) but not for others (e.g. floats).
"#,
Copies a scalar value to a vector value. The scalar is copied into the
least significant lane of the vector, and all other lanes will be zero.
"#,
&formats.unary,
)
.operands_in(vec![s])
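
For reference, the revised scalar_to_vector semantics (scalar copied into the least significant lane, all remaining lanes zeroed) can be modeled by a small standalone Rust sketch; the array-based lane layout is purely illustrative and is not how Cranelift represents vectors.

// Illustrative model: a 32-bit scalar moved into a 4-lane vector.
fn scalar_to_vector_i32x4(scalar: u32) -> [u32; 4] {
    let mut lanes = [0u32; 4];
    lanes[0] = scalar; // lane 0 is the least significant lane
    lanes
}

fn main() {
    assert_eq!(scalar_to_vector_i32x4(42679), [42679, 0, 0, 0]);
}
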
9 changes: 9 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -579,6 +579,15 @@ impl ScalarSize {
}
}

/// Convert to an integer operand size.
pub fn operand_size(&self) -> OperandSize {
match self {
ScalarSize::Size32 => OperandSize::Size32,
ScalarSize::Size64 => OperandSize::Size64,
_ => panic!("Unexpected operand_size request for: {:?}", self),
}
}

/// Convert from a type into the smallest size that fits.
pub fn from_ty(ty: Type) -> ScalarSize {
Self::from_bits(ty_bits(ty))
13 changes: 7 additions & 6 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1651,12 +1651,13 @@ impl MachInstEmit for Inst {
};
sink.put4(enc_fround(top22, rd, rn));
}
&Inst::MovToFpu { rd, rn } => {
sink.put4(
0b100_11110_01_1_00_111_000000_00000_00000
| (machreg_to_gpr(rn) << 5)
| machreg_to_vec(rd.to_reg()),
);
&Inst::MovToFpu { rd, rn, size } => {
let template = match size {
ScalarSize::Size32 => 0b000_11110_00_1_00_111_000000_00000_00000,
ScalarSize::Size64 => 0b100_11110_01_1_00_111_000000_00000_00000,
_ => unreachable!(),
};
sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
}
&Inst::MovToVec { rd, rn, idx, size } => {
let (imm5, shift) = match size.lane_size() {
10 changes: 10 additions & 0 deletions cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1860,10 +1860,20 @@ fn test_aarch64_binemit() {
Inst::MovToFpu {
rd: writable_vreg(31),
rn: xreg(0),
size: ScalarSize::Size64,
},
"1F00679E",
"fmov d31, x0",
));
insns.push((
Inst::MovToFpu {
rd: writable_vreg(1),
rn: xreg(28),
size: ScalarSize::Size32,
},
"8103271E",
"fmov s1, w28",
));
insns.push((
Inst::MovToVec {
rd: writable_vreg(0),
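
As a cross-check of the two FMOV templates added in emit.rs, a minimal standalone sketch of the same bit arithmetic (assuming the bit layouts above are the intended encodings) reproduces the byte sequences expected by these tests:

// Sketch of the MovToFpu emit logic; rn and rd are raw register numbers.
fn enc_fmov_to_fpu(is_64_bit: bool, rn: u32, rd: u32) -> u32 {
    let template: u32 = if is_64_bit {
        0b100_11110_01_1_00_111_000000_00000_00000 // fmov Dd, Xn
    } else {
        0b000_11110_00_1_00_111_000000_00000_00000 // fmov Sd, Wn
    };
    template | (rn << 5) | rd
}

fn main() {
    // fmov d31, x0 -> "1F00679E" in the test above
    assert_eq!(enc_fmov_to_fpu(true, 0, 31).to_le_bytes(), [0x1F, 0x00, 0x67, 0x9E]);
    // fmov s1, w28 -> "8103271E"
    assert_eq!(enc_fmov_to_fpu(false, 28, 1).to_le_bytes(), [0x81, 0x03, 0x27, 0x1E]);
}
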
26 changes: 17 additions & 9 deletions cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -877,10 +877,13 @@ pub enum Inst {
rn: Reg,
},

/// Move from a GPR to a scalar FP register.
/// Move from a GPR to a vector register. The scalar value is parked in the lowest lane
/// of the destination, and all other lanes are zeroed out. Currently only 32- and 64-bit
/// transactions are supported.
MovToFpu {
rd: Writable<Reg>,
rn: Reg,
size: ScalarSize,
},

Review comment from @akirilov-arm (Contributor), Nov 4, 2020: BTW I don't mind the comment at all, but this operation is not special - virtually any instruction that operates on S or D registers (e.g. Inst::FpuRR) has exactly the same behaviour.

/// Move to a vector element from a GPR.
@@ -1319,13 +1322,15 @@ impl Inst {
size: VectorSize::Size8x8
}]
} else {
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent
// bits.
let tmp = alloc_tmp(RegClass::I64, I32);
let mut insts = Inst::load_constant(tmp, value as u64);

insts.push(Inst::MovToFpu {
rd,
rn: tmp.to_reg(),
size: ScalarSize::Size64,
});

insts
@@ -1340,16 +1345,17 @@
) -> SmallVec<[Inst; 4]> {
if let Ok(const_data) = u32::try_from(const_data) {
Inst::load_fp_constant32(rd, const_data, alloc_tmp)
// TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent bits.
// Also, treat it as half of a 128-bit vector and consider replicated patterns. Scalar MOVI
// might also be an option.
// TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent
// bits. Also, treat it as half of a 128-bit vector and consider replicated
// patterns. Scalar MOVI might also be an option.
} else if const_data & (u32::MAX as u64) == 0 {
let tmp = alloc_tmp(RegClass::I64, I64);
let mut insts = Inst::load_constant(tmp, const_data);

insts.push(Inst::MovToFpu {
rd,
rn: tmp.to_reg(),
size: ScalarSize::Size64,
});

insts
@@ -1849,7 +1855,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
collector.add_use(rn);
}
&Inst::MovToFpu { rd, rn } => {
&Inst::MovToFpu { rd, rn, .. } => {
collector.add_def(rd);
collector.add_use(rn);
}
@@ -2523,6 +2529,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
&mut Inst::MovToFpu {
ref mut rd,
ref mut rn,
..
} => {
map_def(mapper, rd);
map_use(mapper, rn);
@@ -3402,9 +3409,10 @@ impl Inst {
let rn = show_vreg_scalar(rn, mb_rru, size);
format!("{} {}, {}", inst, rd, rn)
}
&Inst::MovToFpu { rd, rn } => {
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size64);
&Inst::MovToFpu { rd, rn, size } => {
let operand_size = size.operand_size();
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
let rn = show_ireg_sized(rn, mb_rru, operand_size);
format!("fmov {}, {}", rd, rn)
}
&Inst::MovToVec { rd, rn, idx, size } => {
18 changes: 14 additions & 4 deletions cranelift/codegen/src/isa/aarch64/lower.rs
@@ -837,10 +837,20 @@ pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
rd: Writable<Reg>,
value: u128,
) {
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);

for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
ctx.emit(inst);
if value == 0 {
// Fast-track a common case. The general case, viz, calling `Inst::load_fp_constant128`,
// is potentially expensive.
ctx.emit(Inst::VecDupImm {
rd,
imm: ASIMDMovModImm::zero(),
invert: false,
size: VectorSize::Size8x16,
});
} else {
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
ctx.emit(inst);
}
}
}

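
The new branch in lower_constant_f128 is just a dispatch on the constant's value; a minimal sketch of that decision (with illustrative string labels standing in for the real instruction types) shows the two paths:

// Zero takes a single vector-immediate move (VecDupImm); anything else falls
// back to the general 128-bit constant loader, which may emit several instructions.
fn f128_const_strategy(value: u128) -> &'static str {
    if value == 0 {
        "vec_dup_imm_zero"
    } else {
        "load_fp_constant128"
    }
}

fn main() {
    assert_eq!(f128_const_strategy(0), "vec_dup_imm_zero");
    assert_eq!(f128_const_strategy((1u128 << 64) | 1), "load_fp_constant128");
}
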
39 changes: 35 additions & 4 deletions cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -179,8 +179,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let vb = ctx.alloc_tmp(RegClass::V128, I128);
let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
ctx.emit(Inst::MovToFpu { rd: va, rn: ra });
ctx.emit(Inst::MovToFpu { rd: vb, rn: rb });
ctx.emit(Inst::MovToFpu {
rd: va,
rn: ra,
size: ScalarSize::Size64,
});
ctx.emit(Inst::MovToFpu {
rd: vb,
rn: rb,
size: ScalarSize::Size64,
});
ctx.emit(Inst::FpuRRR {
fpu_op,
rd: va,
@@ -1703,7 +1711,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
(false, true) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
ctx.emit(Inst::MovToFpu { rd, rn });
ctx.emit(Inst::MovToFpu {
rd,
rn,
size: ScalarSize::Size64,
});
}
(true, false) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2056,6 +2068,26 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
}

Opcode::ScalarToVector => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]);
let input_ty = ctx.input_ty(insn, 0);
if (input_ty == I32 && ty.unwrap() == I32X4)
|| (input_ty == I64 && ty.unwrap() == I64X2)
{
ctx.emit(Inst::MovToFpu {
rd,
rn,
size: ScalarSize::from_ty(input_ty),
});
} else {
return Err(CodegenError::Unsupported(format!(
"ScalarToVector: unsupported types {:?} -> {:?}",
input_ty, ty
)));
}
}

Opcode::VanyTrue | Opcode::VallTrue => {
let rd = get_output_reg(ctx, outputs[0]);
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2341,7 +2373,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(

Opcode::Vsplit
| Opcode::Vconcat
| Opcode::ScalarToVector
| Opcode::Uload8x8Complex
| Opcode::Sload8x8Complex
| Opcode::Uload16x4Complex
33 changes: 33 additions & 0 deletions cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif
@@ -0,0 +1,33 @@
test compile
target aarch64

function %f1() -> i64x2 {
block0:
v0 = iconst.i64 281474976710657
v1 = scalar_to_vector.i64x2 v0
return v1
}

; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #1
; nextln: movk x0, #1, LSL #48
; nextln: fmov d0, x0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

function %f2() -> i32x4 {
block0:
v0 = iconst.i32 42679
v1 = scalar_to_vector.i32x4 v0
return v1
}

; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #42679
; nextln: fmov s0, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
22 changes: 16 additions & 6 deletions cranelift/wasm/src/code_translator.rs
@@ -1426,6 +1426,18 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
state.push1(dfg.first_result(load))
}
Operator::V128Load32Zero { memarg } | Operator::V128Load64Zero { memarg } => {
translate_load(
memarg,
ir::Opcode::Load,
type_of(op).lane_type(),
builder,
state,
environ,
)?;
let as_vector = builder.ins().scalar_to_vector(type_of(op), state.pop1());
state.push1(as_vector)
}
Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
let vector = pop1_with_bitcast(state, type_of(op), builder);
let extracted = builder.ins().extractlane(vector, lane.clone());
@@ -1790,10 +1802,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
return Err(wasm_unsupported!("proposed tail-call operator {:?}", op));
}

Operator::V128Load32Zero { .. } | Operator::V128Load64Zero { .. } => {
return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
}
};
Ok(())
}
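
Putting the pieces together, the V128Load{32,64}Zero arm above lowers the wasm operation to an ordinary scalar load followed by scalar_to_vector. A small behavioural model of v128.load32_zero (byte-level, little-endian, with bounds checks and traps ignored) looks like this; it is a sketch of the intended semantics, not the Cranelift lowering itself:

// v128.load32_zero: load 32 bits from memory into lane 0, zero lanes 1..3.
fn v128_load32_zero(memory: &[u8], addr: usize) -> [u32; 4] {
    let mut word = [0u8; 4];
    word.copy_from_slice(&memory[addr..addr + 4]);
    [u32::from_le_bytes(word), 0, 0, 0]
}

fn main() {
    let memory = [0xB7, 0xA6, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF];
    // 0x0000A6B7 == 42679, matching the filetest constant above.
    assert_eq!(v128_load32_zero(&memory, 0), [42679, 0, 0, 0]);
}
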
@@ -2516,7 +2524,8 @@ fn type_of(operator: &Operator) -> Type {
| Operator::I32x4MaxU
| Operator::F32x4ConvertI32x4S
| Operator::F32x4ConvertI32x4U
| Operator::I32x4Bitmask => I32X4,
| Operator::I32x4Bitmask
| Operator::V128Load32Zero { .. } => I32X4,

Operator::I64x2Splat
| Operator::V128Load64Splat { .. }
@@ -2528,7 +2537,8 @@
| Operator::I64x2ShrU
| Operator::I64x2Add
| Operator::I64x2Sub
| Operator::I64x2Mul => I64X2,
| Operator::I64x2Mul
| Operator::V128Load64Zero { .. } => I64X2,

Operator::F32x4Splat
| Operator::F32x4ExtractLane { .. }