From 3713dcced53830eec70bfb644f436c86ab489125 Mon Sep 17 00:00:00 2001
From: Julian Seward
Date: Tue, 3 Nov 2020 17:34:07 +0100
Subject: [PATCH] CL/aarch64: implement the wasm SIMD `v128.load{32,64}_zero` instructions.

This patch implements, for aarch64, the following wasm SIMD extensions:

  v128.load32_zero and v128.load64_zero instructions
  https://github.com/WebAssembly/simd/pull/237

The changes are straightforward:

* no new CLIF instructions. They are translated into an existing CLIF scalar
  load followed by a CLIF `scalar_to_vector`.

* the comment/specification for CLIF `scalar_to_vector` has been changed to
  match the actual intended semantics, per consultation with Andrew Brown.

* translation from `scalar_to_vector` to the aarch64 `fmov` instruction. This
  has been generalised slightly so as to allow both 32- and 64-bit transfers.

* special-case zero in `lower_constant_f128` in order to avoid a potentially
  slow call to `Inst::load_fp_constant128`.

* Once "Allow loads to merge into other operations during instruction
  selection in MachInst backends"
  (https://github.com/bytecodealliance/wasmtime/issues/2340) lands, we can
  use that functionality to pattern match the two-CLIF pair and emit a single
  AArch64 instruction.

* A simple filetest has been added.

There is no comprehensive test case in this commit, because those tests live
in a separate repository. The implementation has nevertheless been tested.
---
 .../codegen/meta/src/shared/instructions.rs   |  9 ++---
 .../codegen/src/isa/aarch64/inst/args.rs      |  9 +++++
 .../codegen/src/isa/aarch64/inst/emit.rs      | 13 ++++---
 .../src/isa/aarch64/inst/emit_tests.rs        | 10 +++++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs | 26 ++++++++-----
 cranelift/codegen/src/isa/aarch64/lower.rs    | 18 +++++++--
 .../codegen/src/isa/aarch64/lower_inst.rs     | 39 +++++++++++++++++--
 .../filetests/isa/aarch64/simd_load_zero.clif | 33 ++++++++++++++++
 cranelift/wasm/src/code_translator.rs         | 22 ++++++++---
 9 files changed, 144 insertions(+), 35 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif

diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index 2af389ed7971..bd1444d79cd5 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3798,12 +3798,9 @@ pub(crate) fn define(
         Inst::new(
             "scalar_to_vector",
             r#"
-        Scalar To Vector -- move a value out of a scalar register and into a vector register; the
-        scalar will be moved to the lowest-order bits of the vector register. Note that this
-        instruction is intended as a low-level legalization instruction and frontends should prefer
-        insertlane; on certain architectures, scalar_to_vector may zero the highest-order bits for some
-        types (e.g. integers) but not for others (e.g. floats).
-        "#,
+        Copies a scalar value to a vector value. The scalar is copied into the
+        least significant lane of the vector, and all other lanes will be zero.
+        "#,
             &formats.unary,
         )
         .operands_in(vec![s])
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index e1b83eccc0f3..7bd181c86b03 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -579,6 +579,15 @@ impl ScalarSize {
         }
     }
 
+    /// Convert to an integer operand size.
+    pub fn operand_size(&self) -> OperandSize {
+        match self {
+            ScalarSize::Size32 => OperandSize::Size32,
+            ScalarSize::Size64 => OperandSize::Size64,
+            _ => panic!("Unexpected operand_size request for: {:?}", self),
+        }
+    }
+
     /// Convert from a type into the smallest size that fits.
     pub fn from_ty(ty: Type) -> ScalarSize {
         Self::from_bits(ty_bits(ty))
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 2530769879d6..5aad8f1c3004 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1651,12 +1651,13 @@ impl MachInstEmit for Inst {
                 };
                 sink.put4(enc_fround(top22, rd, rn));
             }
-            &Inst::MovToFpu { rd, rn } => {
-                sink.put4(
-                    0b100_11110_01_1_00_111_000000_00000_00000
-                        | (machreg_to_gpr(rn) << 5)
-                        | machreg_to_vec(rd.to_reg()),
-                );
+            &Inst::MovToFpu { rd, rn, size } => {
+                let template = match size {
+                    ScalarSize::Size32 => 0b000_11110_00_1_00_111_000000_00000_00000,
+                    ScalarSize::Size64 => 0b100_11110_01_1_00_111_000000_00000_00000,
+                    _ => unreachable!(),
+                };
+                sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
             }
             &Inst::MovToVec { rd, rn, idx, size } => {
                 let (imm5, shift) = match size.lane_size() {
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 9f8540331cbb..4c73244b92ca 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -1860,10 +1860,20 @@ fn test_aarch64_binemit() {
         Inst::MovToFpu {
             rd: writable_vreg(31),
             rn: xreg(0),
+            size: ScalarSize::Size64,
         },
         "1F00679E",
         "fmov d31, x0",
     ));
+    insns.push((
+        Inst::MovToFpu {
+            rd: writable_vreg(1),
+            rn: xreg(28),
+            size: ScalarSize::Size32,
+        },
+        "8103271E",
+        "fmov s1, w28",
+    ));
     insns.push((
         Inst::MovToVec {
             rd: writable_vreg(0),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 6d1cb903c2d0..d38fc7fdc88f 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -877,10 +877,13 @@ pub enum Inst {
         rn: Reg,
     },
 
-    /// Move from a GPR to a scalar FP register.
+    /// Move from a GPR to a vector register. The scalar value is parked in the lowest lane
+    /// of the destination, and all other lanes are zeroed out. Currently only 32- and 64-bit
+    /// transfers are supported.
     MovToFpu {
         rd: Writable<Reg>,
         rn: Reg,
+        size: ScalarSize,
     },
 
     /// Move to a vector element from a GPR.
@@ -1319,13 +1322,15 @@ impl Inst {
                 size: VectorSize::Size8x8
             }]
         } else {
-            // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
+            // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent
+            // bits.
             let tmp = alloc_tmp(RegClass::I64, I32);
             let mut insts = Inst::load_constant(tmp, value as u64);
 
             insts.push(Inst::MovToFpu {
                 rd,
                 rn: tmp.to_reg(),
+                size: ScalarSize::Size64,
             });
 
             insts
@@ -1340,9 +1345,9 @@ impl Inst {
     ) -> SmallVec<[Inst; 4]> {
         if let Ok(const_data) = u32::try_from(const_data) {
             Inst::load_fp_constant32(rd, const_data, alloc_tmp)
-            // TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent bits.
-            // Also, treat it as half of a 128-bit vector and consider replicated patterns. Scalar MOVI
-            // might also be an option.
+            // TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent
+            // bits. Also, treat it as half of a 128-bit vector and consider replicated
+            // patterns. Scalar MOVI might also be an option.
         } else if const_data & (u32::MAX as u64) == 0 {
             let tmp = alloc_tmp(RegClass::I64, I64);
             let mut insts = Inst::load_constant(tmp, const_data);
@@ -1350,6 +1355,7 @@ impl Inst {
             insts.push(Inst::MovToFpu {
                 rd,
                 rn: tmp.to_reg(),
+                size: ScalarSize::Size64,
             });
 
             insts
@@ -1849,7 +1855,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
             collector.add_def(rd);
             collector.add_use(rn);
         }
-        &Inst::MovToFpu { rd, rn } => {
+        &Inst::MovToFpu { rd, rn, .. } => {
             collector.add_def(rd);
             collector.add_use(rn);
         }
@@ -2523,6 +2529,7 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
         &mut Inst::MovToFpu {
             ref mut rd,
             ref mut rn,
+            ..
         } => {
             map_def(mapper, rd);
             map_use(mapper, rn);
@@ -3402,9 +3409,10 @@ impl Inst {
                 let rn = show_vreg_scalar(rn, mb_rru, size);
                 format!("{} {}, {}", inst, rd, rn)
             }
-            &Inst::MovToFpu { rd, rn } => {
-                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
-                let rn = show_ireg_sized(rn, mb_rru, OperandSize::Size64);
+            &Inst::MovToFpu { rd, rn, size } => {
+                let operand_size = size.operand_size();
+                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
+                let rn = show_ireg_sized(rn, mb_rru, operand_size);
                 format!("fmov {}, {}", rd, rn)
             }
             &Inst::MovToVec { rd, rn, idx, size } => {
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 9549ef700aff..bcc5cbeaf063 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -837,10 +837,20 @@ pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
     rd: Writable<Reg>,
     value: u128,
 ) {
-    let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
-
-    for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
-        ctx.emit(inst);
+    if value == 0 {
+        // Fast-track a common case. The general case, viz. calling `Inst::load_fp_constant128`,
+        // is potentially expensive.
+        ctx.emit(Inst::VecDupImm {
+            rd,
+            imm: ASIMDMovModImm::zero(),
+            invert: false,
+            size: VectorSize::Size8x16,
+        });
+    } else {
+        let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
+        for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
+            ctx.emit(inst);
+        }
     }
 }
 
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index bf5e490cdee8..957d15c57cd1 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -179,8 +179,16 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let vb = ctx.alloc_tmp(RegClass::V128, I128);
             let ra = put_input_in_reg(ctx, inputs[0], narrow_mode);
             let rb = put_input_in_reg(ctx, inputs[1], narrow_mode);
-            ctx.emit(Inst::MovToFpu { rd: va, rn: ra });
-            ctx.emit(Inst::MovToFpu { rd: vb, rn: rb });
+            ctx.emit(Inst::MovToFpu {
+                rd: va,
+                rn: ra,
+                size: ScalarSize::Size64,
+            });
+            ctx.emit(Inst::MovToFpu {
+                rd: vb,
+                rn: rb,
+                size: ScalarSize::Size64,
+            });
             ctx.emit(Inst::FpuRRR {
                 fpu_op,
                 rd: va,
@@ -1703,7 +1711,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 }
                 (false, true) => {
                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64);
-                    ctx.emit(Inst::MovToFpu { rd, rn });
+                    ctx.emit(Inst::MovToFpu {
+                        rd,
+                        rn,
+                        size: ScalarSize::Size64,
+                    });
                 }
                 (true, false) => {
                     let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2056,6 +2068,26 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             }
         }
 
+        Opcode::ScalarToVector => {
+            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+            let rd = get_output_reg(ctx, outputs[0]);
+            let input_ty = ctx.input_ty(insn, 0);
+            if (input_ty == I32 && ty.unwrap() == I32X4)
+                || (input_ty == I64 && ty.unwrap() == I64X2)
+            {
+                ctx.emit(Inst::MovToFpu {
+                    rd,
+                    rn,
+                    size: ScalarSize::from_ty(input_ty),
+                });
+            } else {
+                return Err(CodegenError::Unsupported(format!(
+                    "ScalarToVector: unsupported types {:?} -> {:?}",
+                    input_ty, ty
+                )));
+            }
+        }
+
         Opcode::VanyTrue | Opcode::VallTrue => {
             let rd = get_output_reg(ctx, outputs[0]);
             let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -2341,7 +2373,6 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::Vsplit
         | Opcode::Vconcat
-        | Opcode::ScalarToVector
         | Opcode::Uload8x8Complex
         | Opcode::Sload8x8Complex
         | Opcode::Uload16x4Complex
diff --git a/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif b/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif
new file mode 100644
index 000000000000..9919c3c09bb4
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/aarch64/simd_load_zero.clif
@@ -0,0 +1,33 @@
+test compile
+target aarch64
+
+function %f1() -> i64x2 {
+block0:
+  v0 = iconst.i64 281474976710657
+  v1 = scalar_to_vector.i64x2 v0
+  return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x0, #1
+; nextln: movk x0, #1, LSL #48
+; nextln: fmov d0, x0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f2() -> i32x4 {
+block0:
+  v0 = iconst.i32 42679
+  v1 = scalar_to_vector.i32x4 v0
+  return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x0, #42679
+; nextln: fmov s0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 922f085f1c34..fcd68e51e548 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1426,6 +1426,18 @@ pub fn translate_operator(
             let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base);
             state.push1(dfg.first_result(load))
         }
+        Operator::V128Load32Zero { memarg } | Operator::V128Load64Zero { memarg } => {
+            translate_load(
+                memarg,
+                ir::Opcode::Load,
+                type_of(op).lane_type(),
+                builder,
+                state,
+                environ,
+            )?;
+            let as_vector = builder.ins().scalar_to_vector(type_of(op), state.pop1());
+            state.push1(as_vector)
+        }
         Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => {
             let vector = pop1_with_bitcast(state, type_of(op), builder);
             let extracted = builder.ins().extractlane(vector, lane.clone());
@@ -1790,10 +1802,6 @@ pub fn translate_operator(
         Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {
             return Err(wasm_unsupported!("proposed tail-call operator {:?}", op));
         }
-
-        Operator::V128Load32Zero { .. } | Operator::V128Load64Zero { .. } => {
-            return Err(wasm_unsupported!("proposed SIMD operator {:?}", op));
-        }
     };
     Ok(())
 }
@@ -2516,7 +2524,8 @@ fn type_of(operator: &Operator) -> Type {
         | Operator::I32x4MaxU
         | Operator::F32x4ConvertI32x4S
        | Operator::F32x4ConvertI32x4U
-        | Operator::I32x4Bitmask => I32X4,
+        | Operator::I32x4Bitmask
+        | Operator::V128Load32Zero { .. } => I32X4,
 
         Operator::I64x2Splat
         | Operator::V128Load64Splat { .. }
@@ -2528,7 +2537,8 @@ fn type_of(operator: &Operator) -> Type {
         | Operator::I64x2ShrU
         | Operator::I64x2Add
         | Operator::I64x2Sub
-        | Operator::I64x2Mul => I64X2,
+        | Operator::I64x2Mul
+        | Operator::V128Load64Zero { .. } => I64X2,
 
         Operator::F32x4Splat
         | Operator::F32x4ExtractLane { .. }