Skip to content

Commit

Permalink
Implement intrinsic for swapping values
Browse files Browse the repository at this point in the history
This allows moving target- and backend-specific optimizations from library code to codegen.
Also, this should make const eval/Miri evaluation simpler.

The main optimization implemented in this PR makes the backend generate swaps without using allocas,
removing unnecessary memory writes and reads and reducing stack usage.

One of the main optimizations is using larger integer chunks for swapping on x86_64 by utilizing unaligned reads/writes. It reduces code size (especially for debug builds) and prevents cases of ineffective vectorization like `load <4 x i8>` (LLVM doesn't vectorize it further despite vectorizing `load i32`).

Also added more tests.
  • Loading branch information
AngelicosPhosphoros committed Jun 12, 2023
1 parent 81c02da commit c6db014
Show file tree
Hide file tree
Showing 22 changed files with 1,309 additions and 113 deletions.
2 changes: 1 addition & 1 deletion compiler/rustc_codegen_cranelift/src/driver/jit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ fn dep_symbol_lookup_fn(
Linkage::NotLinked | Linkage::IncludedFromDylib => {}
Linkage::Static => {
let name = crate_info.crate_name[&cnum];
let mut err = sess.struct_err(&format!("Can't load static lib {}", name));
let mut err = sess.struct_err(format!("Can't load static lib {}", name));
err.note("rustc_codegen_cranelift can only load dylibs in JIT mode.");
err.emit();
}
Expand Down
124 changes: 124 additions & 0 deletions compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,130 @@ fn codegen_regular_intrinsic_call<'tcx>(
// FIXME use emit_small_memset
fx.bcx.call_memset(fx.target_config, dst_ptr, val, count);
}

sym::swap_nonoverlapping_single => {
    // Swaps the single pointee behind `x_ptr` with the one behind `y_ptr`
    // by routing one of them through a temporary stack slot.
    intrinsic_args!(fx, args => (x_ptr, y_ptr); intrinsic);
    let pointee_ty = x_ptr.layout().ty.builtin_deref(true).unwrap().ty;
    let pointee_layout = fx.layout_of(pointee_ty);

    // Swapping ZSTs is a no-op.
    if pointee_layout.size != Size::ZERO {
        // Probably, it would be better to have a dedicated method for this in
        // `cranelift_frontend::FunctionBuilder`
        // with optimizations based on size and alignment of values.

        let x_ptr_val = x_ptr.load_scalar(fx);
        let y_ptr_val = y_ptr.load_scalar(fx);

        let tmp_place = CPlace::new_stack_slot(fx, pointee_layout);
        let tmp_ptr_val = tmp_place.to_ptr().get_addr(fx);

        let size_bytes = pointee_layout.size.bytes();
        // The copy helper takes alignments as `u8`. Alignments are powers of two, so
        // any value that does not fit (>= 256) is clamped to 128: under-reporting an
        // alignment is always valid, whereas unwrapping the conversion would panic.
        let align_bytes = u8::try_from(pointee_layout.align.abi.bytes()).unwrap_or(128);

        // tmp = *x; *x = *y; *y = tmp;
        let copies = [
            (tmp_ptr_val, x_ptr_val),
            (x_ptr_val, y_ptr_val),
            (y_ptr_val, tmp_ptr_val),
        ];
        for (dst_ptr_val, src_ptr_val) in copies {
            fx.bcx.emit_small_memory_copy(
                fx.target_config,
                dst_ptr_val,
                src_ptr_val,
                size_bytes,
                align_bytes,
                align_bytes,
                true,
                MemFlags::trusted(),
            );
        }
    }
}

sym::swap_nonoverlapping_many => {
    // Swaps `count` consecutive pointees starting at `x_ptr` with the ones starting
    // at `y_ptr`, one element per loop iteration, through a temporary stack slot.
    intrinsic_args!(fx, args => (x_ptr, y_ptr, count); intrinsic);
    let pointee_ty = x_ptr.layout().ty.builtin_deref(true).unwrap().ty;
    let pointee_layout = fx.layout_of(pointee_ty);

    // Swapping ZSTs is a no-op.
    if pointee_layout.size != Size::ZERO {
        let x_ptr_val = x_ptr.load_scalar(fx);
        let y_ptr_val = y_ptr.load_scalar(fx);

        let count = count.load_scalar(fx);

        let tmp_place = CPlace::new_stack_slot(fx, pointee_layout);
        let tmp_ptr_val = tmp_place.to_ptr().get_addr(fx);

        let elem_size_bytes = pointee_layout.size.bytes();
        // The copy helper takes alignments as `u8`. Alignments are powers of two, so
        // any value that does not fit (>= 256) is clamped to 128: under-reporting an
        // alignment is always valid, whereas unwrapping the conversion would panic.
        let align_bytes = u8::try_from(pointee_layout.align.abi.bytes()).unwrap_or(128);
        let elem_stride =
            i64::try_from(elem_size_bytes).expect("element size must fit in an i64 immediate");

        let loop_header = fx.bcx.create_block();
        let loop_body = fx.bcx.create_block();
        let loop_done = fx.bcx.create_block();

        // `index` counts swapped *elements*; it enters the header as a block parameter.
        let index = fx.bcx.append_block_param(loop_header, fx.pointer_type);
        let zero = fx.bcx.ins().iconst(fx.pointer_type, 0);
        fx.bcx.ins().jump(loop_header, &[zero]);

        fx.bcx.switch_to_block(loop_header);
        let is_done = fx.bcx.ins().icmp(IntCC::Equal, index, count);
        fx.bcx.ins().brif(is_done, loop_done, &[], loop_body, &[]);

        fx.bcx.switch_to_block(loop_body);
        // Pointers are byte addresses, so the element index has to be scaled by the
        // element size. Adding the raw index would make consecutive iterations swap
        // overlapping regions for every element wider than one byte.
        let byte_offset = fx.bcx.ins().imul_imm(index, elem_stride);
        let curr_x_ptr_val = fx.bcx.ins().iadd(x_ptr_val, byte_offset);
        let curr_y_ptr_val = fx.bcx.ins().iadd(y_ptr_val, byte_offset);

        // tmp = x[i]; x[i] = y[i]; y[i] = tmp;
        let copies = [
            (tmp_ptr_val, curr_x_ptr_val),
            (curr_x_ptr_val, curr_y_ptr_val),
            (curr_y_ptr_val, tmp_ptr_val),
        ];
        for (dst_ptr_val, src_ptr_val) in copies {
            fx.bcx.emit_small_memory_copy(
                fx.target_config,
                dst_ptr_val,
                src_ptr_val,
                elem_size_bytes,
                align_bytes,
                align_bytes,
                true,
                MemFlags::trusted(),
            );
        }
        let next_index = fx.bcx.ins().iadd_imm(index, 1);
        fx.bcx.ins().jump(loop_header, &[next_index]);

        fx.bcx.switch_to_block(loop_done);
        fx.bcx.ins().nop();
    }
}

sym::ctlz | sym::ctlz_nonzero => {
intrinsic_args!(fx, args => (arg); intrinsic);
let val = arg.load_scalar(fx);
Expand Down
49 changes: 49 additions & 0 deletions compiler/rustc_codegen_gcc/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,55 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
self.block.add_eval(None, self.context.new_call(None, memset, &[ptr, fill_byte, size]));
}

/// Emits a counted loop that walks `VAR_COUNT` memory regions in lockstep.
///
/// On iteration `i` (counting from zero up to, but excluding, `iterations`),
/// `body_visitor` is handed one byte pointer per region, each equal to
/// `start_ptrs[k] + i * steps[k].bytes()`. When this returns, the builder is
/// positioned in the block that follows the loop.
///
/// Panics if `VAR_COUNT` is zero or if any entry of `steps` is zero bytes.
fn make_memory_loop<BodyPtrsVisitor, const VAR_COUNT: usize>(
    &mut self,
    loop_name: &str,
    start_ptrs: [Self::Value; VAR_COUNT],
    steps: [Size; VAR_COUNT],
    iterations: Self::Value,
    body_visitor: BodyPtrsVisitor,
) where
    BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]),
{
    assert!(VAR_COUNT > 0, "VAR_COUNT must be bigger than zero.");
    steps.iter().for_each(|stride| {
        assert_ne!(stride.bytes(), 0, "We are iterating over memory, ZSTs unexpected.");
    });

    // Blocks are created in the order header, body, next.
    let [cond_block, work_block, exit_block] = ["header", "body", "next"]
        .map(|suffix| self.append_sibling_block(&format!("{}_{}", loop_name, suffix)));

    let counter_init = self.const_usize(0);
    let byte_strides: [Self::Value; VAR_COUNT] =
        steps.map(|stride| self.const_usize(stride.bytes()));

    // The iteration counter lives in a function-local variable initialised to zero.
    let counter_ty = self.type_size_t();
    let counter = self.llbb().get_function().new_local(None, counter_ty, "loop_i");
    self.assign(counter, counter_init);
    let counter_val = counter.to_rvalue();

    self.br(cond_block);

    // Header: keep looping while the counter differs from `iterations`.
    self.switch_to_block(cond_block);
    let not_finished = self.icmp(IntPredicate::IntNE, counter_val, iterations);
    self.cond_br(not_finished, work_block, exit_block);

    // Body: compute this iteration's pointers, run the visitor, bump the counter.
    self.switch_to_block(work_block);
    let current_ptrs: [Self::Value; VAR_COUNT] = core::array::from_fn(|slot| {
        let byte_ptr_ty = self.type_i8p();
        let base = self.pointercast(start_ptrs[slot], byte_ptr_ty);
        let byte_offset = self.unchecked_umul(byte_strides[slot], counter_val);
        let byte_ty = self.type_i8();
        self.inbounds_gep(byte_ty, base, &[byte_offset])
    });
    body_visitor(self, &current_ptrs);
    let one = self.const_usize(1);
    let bumped = self.unchecked_uadd(counter_val, one);
    self.assign(counter, bumped);
    self.br(cond_block);

    self.switch_to_block(exit_block);
}

fn select(&mut self, cond: RValue<'gcc>, then_val: RValue<'gcc>, mut else_val: RValue<'gcc>) -> RValue<'gcc> {
let func = self.current_func();
let variable = func.new_local(None, then_val.get_type(), "selectVar");
Expand Down
48 changes: 48 additions & 0 deletions compiler/rustc_codegen_llvm/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,54 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
}
}

/// Emits a counted loop that walks `VAR_COUNT` memory regions in lockstep.
///
/// On iteration `i` (counting from zero up to, but excluding, `iterations`),
/// `body_visitor` is handed a builder positioned in the loop body plus one byte
/// pointer per region, each equal to `start_ptrs[k] + i * steps[k].bytes()`.
/// When this returns, `self` has been replaced by a builder positioned in the
/// block that follows the loop.
///
/// `VAR_COUNT == 0` is rejected at compile time; panics if any entry of `steps`
/// is zero bytes.
fn make_memory_loop<BodyPtrsVisitor, const VAR_COUNT: usize>(
    &mut self,
    loop_name: &str,
    start_ptrs: [Self::Value; VAR_COUNT],
    steps: [Size; VAR_COUNT],
    iterations: Self::Value,
    body_visitor: BodyPtrsVisitor,
) where
    BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]),
{
    const {
        assert!(VAR_COUNT > 0, "VAR_COUNT must be bigger than zero.");
    }
    for step in steps {
        assert_ne!(step.bytes(), 0, "We are iterating over memory, ZSTs unexpected.");
    }

    let zero = self.const_usize(0);
    let additions: [Self::Value; VAR_COUNT] = steps.map(|st| self.const_usize(st.bytes()));

    let header_bb = self.append_sibling_block(&format!("{}_header", loop_name));
    let body_bb = self.append_sibling_block(&format!("{}_body", loop_name));
    let next_bb = self.append_sibling_block(&format!("{}_next", loop_name));
    // This terminates the current block: nothing else may be emitted through `self`
    // until it is re-pointed at `next_bb` at the end of this function.
    self.br(header_bb);

    let mut header_bx = Builder::build(self.cx, header_bb);
    // Use an integer for iteration instead of pointers because LLVM canonicalizes
    // loops into the indexed form anyway.
    let loop_i = header_bx.phi(self.type_isize(), &[zero], &[self.llbb()]);
    let keep_going = header_bx.icmp(IntPredicate::IntNE, loop_i, iterations);
    header_bx.cond_br(keep_going, body_bb, next_bb);

    let mut body_bx = Builder::build(self.cx, body_bb);
    let current_ptrs: [Self::Value; VAR_COUNT] = std::array::from_fn(|i| {
        // FIXME: Remove pointercast after dropping support for LLVM 14.
        // The cast is emitted through `body_bx`: emitting it through `self` would
        // place it after the `br` terminator of the predecessor block.
        let start = body_bx.pointercast(start_ptrs[i], body_bx.type_i8p());
        let offset = body_bx.unchecked_umul(loop_i, additions[i]);
        body_bx.inbounds_gep(body_bx.type_i8(), start, &[offset])
    });
    body_visitor(&mut body_bx, &current_ptrs);
    let next_i = body_bx.unchecked_uadd(loop_i, body_bx.const_usize(1));
    // The visitor may have left `body_bx` in a different block (e.g. after emitting
    // nested control flow), so the back edge must be registered from the block the
    // builder currently points at, which is `body_bb` when no blocks were added.
    header_bx.add_incoming_to_phi(loop_i, next_i, body_bx.llbb());
    body_bx.br(header_bb);

    *self = Builder::build(self.cx, next_bb);
}

fn select(
&mut self,
cond: &'ll Value,
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_codegen_llvm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#![doc(html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/")]
#![feature(extern_types)]
#![feature(hash_raw_entry)]
#![feature(inline_const)]
#![feature(iter_intersperse)]
#![feature(let_chains)]
#![feature(never_type)]
Expand Down
Loading

0 comments on commit c6db014

Please sign in to comment.