From c6db0144bac34b0fed392fc6c548e665c47925ee Mon Sep 17 00:00:00 2001 From: AngelicosPhosphoros Date: Wed, 17 May 2023 02:22:10 +0400 Subject: [PATCH] Implement intrinsic for swapping values This allows move target- and backend-specific optmization from library code to codegen. Also, this should make const eval/miri evaluation simpler. Main optimization implemented in this PR makes backend generate swap without using allocas removing unneccessary memory writes and reads and reducing stack usage. One of the main optimizations is using larger integer chunks for swapping in x86_64 by utilizing unaligned reads/writes. It reduces code size (especially for debug builds) and prevent cases of ineffective vectorizations like `load <4 x i8>` (LLVM doesn't vectorize it further despite vectorizing `load i32`). Also added more tests. --- .../rustc_codegen_cranelift/src/driver/jit.rs | 2 +- .../src/intrinsics/mod.rs | 124 +++++ compiler/rustc_codegen_gcc/src/builder.rs | 49 ++ compiler/rustc_codegen_llvm/src/builder.rs | 48 ++ compiler/rustc_codegen_llvm/src/lib.rs | 1 + .../rustc_codegen_ssa/src/mir/intrinsic.rs | 469 +++++++++++++++++- .../rustc_codegen_ssa/src/traits/builder.rs | 15 + .../src/interpret/intrinsics.rs | 18 + .../rustc_const_eval/src/interpret/memory.rs | 59 ++- .../rustc_hir_analysis/src/check/intrinsic.rs | 17 + compiler/rustc_span/src/symbol.rs | 2 + library/core/src/intrinsics.rs | 61 +++ library/core/src/mem/mod.rs | 64 ++- library/core/src/ptr/mod.rs | 39 +- library/core/tests/mem.rs | 86 ++++ library/core/tests/ptr.rs | 22 + tests/codegen/swap-large-types.rs | 45 +- tests/codegen/swap-simd-types.rs | 6 +- tests/codegen/swap-small-types.rs | 110 +++- .../consts/missing_span_in_backtrace.stderr | 6 - .../intrinsics/swap_nonoverlapping_single.rs | 132 +++++ .../swap_nonoverlapping_single.stderr | 47 ++ 22 files changed, 1309 insertions(+), 113 deletions(-) create mode 100644 tests/ui/intrinsics/swap_nonoverlapping_single.rs create mode 100644 tests/ui/intrinsics/swap_nonoverlapping_single.stderr diff --git a/compiler/rustc_codegen_cranelift/src/driver/jit.rs b/compiler/rustc_codegen_cranelift/src/driver/jit.rs index 3118105a4e2d7..62a3e09691234 100644 --- a/compiler/rustc_codegen_cranelift/src/driver/jit.rs +++ b/compiler/rustc_codegen_cranelift/src/driver/jit.rs @@ -325,7 +325,7 @@ fn dep_symbol_lookup_fn( Linkage::NotLinked | Linkage::IncludedFromDylib => {} Linkage::Static => { let name = crate_info.crate_name[&cnum]; - let mut err = sess.struct_err(&format!("Can't load static lib {}", name)); + let mut err = sess.struct_err(format!("Can't load static lib {}", name)); err.note("rustc_codegen_cranelift can only load dylibs in JIT mode."); err.emit(); } diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs index 1e83c30bd677a..38c120fd4ba86 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs @@ -567,6 +567,130 @@ fn codegen_regular_intrinsic_call<'tcx>( // FIXME use emit_small_memset fx.bcx.call_memset(fx.target_config, dst_ptr, val, count); } + + sym::swap_nonoverlapping_single => { + intrinsic_args!(fx, args => (x_ptr, y_ptr); intrinsic); + let pointee_ty = x_ptr.layout().ty.builtin_deref(true).unwrap().ty; + let pointee_layout = fx.layout_of(pointee_ty); + + // ZSTs swap is noop. 
+ if pointee_layout.size != Size::ZERO { + // Probably, it would be better to have dedicated method for this in + // `cranelift_frontend::FunctionBuilder` + // with optimizations based on size and alignment of values. + + let x_ptr_val = x_ptr.load_scalar(fx); + let y_ptr_val = y_ptr.load_scalar(fx); + + let tmp_place = CPlace::new_stack_slot(fx, pointee_layout); + let tmp_ptr_val = tmp_place.to_ptr().get_addr(fx); + + let size_bytes = pointee_layout.size.bytes(); + let align_bytes: u8 = pointee_layout.align.abi.bytes().try_into().unwrap(); + fx.bcx.emit_small_memory_copy( + fx.target_config, + tmp_ptr_val, + x_ptr_val, + size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + x_ptr_val, + y_ptr_val, + size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + y_ptr_val, + tmp_ptr_val, + size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + } + } + + sym::swap_nonoverlapping_many => { + intrinsic_args!(fx, args => (x_ptr, y_ptr, count); intrinsic); + let pointee_ty = x_ptr.layout().ty.builtin_deref(true).unwrap().ty; + let pointee_layout = fx.layout_of(pointee_ty); + + // ZSTs swap is noop. + if pointee_layout.size != Size::ZERO { + let x_ptr_val = x_ptr.load_scalar(fx); + let y_ptr_val = y_ptr.load_scalar(fx); + + let count = count.load_scalar(fx); + + let tmp_place = CPlace::new_stack_slot(fx, pointee_layout); + let tmp_ptr_val = tmp_place.to_ptr().get_addr(fx); + + let elem_size_bytes = pointee_layout.size.bytes(); + let align_bytes: u8 = pointee_layout.align.abi.bytes().try_into().unwrap(); + + let loop_header = fx.bcx.create_block(); + let loop_body = fx.bcx.create_block(); + let loop_done = fx.bcx.create_block(); + + let index = fx.bcx.append_block_param(loop_header, fx.pointer_type); + let zero = fx.bcx.ins().iconst(fx.pointer_type, 0); + fx.bcx.ins().jump(loop_header, &[zero]); + + fx.bcx.switch_to_block(loop_header); + let is_done = fx.bcx.ins().icmp(IntCC::Equal, index, count); + fx.bcx.ins().brif(is_done, loop_done, &[], loop_body, &[]); + + fx.bcx.switch_to_block(loop_body); + let curr_x_ptr_val = fx.bcx.ins().iadd(x_ptr_val, index); + let curr_y_ptr_val = fx.bcx.ins().iadd(y_ptr_val, index); + fx.bcx.emit_small_memory_copy( + fx.target_config, + tmp_ptr_val, + curr_x_ptr_val, + elem_size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + curr_x_ptr_val, + curr_y_ptr_val, + elem_size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + curr_y_ptr_val, + tmp_ptr_val, + elem_size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + let next_index = fx.bcx.ins().iadd_imm(index, 1); + fx.bcx.ins().jump(loop_header, &[next_index]); + + fx.bcx.switch_to_block(loop_done); + fx.bcx.ins().nop(); + } + } + sym::ctlz | sym::ctlz_nonzero => { intrinsic_args!(fx, args => (arg); intrinsic); let val = arg.load_scalar(fx); diff --git a/compiler/rustc_codegen_gcc/src/builder.rs b/compiler/rustc_codegen_gcc/src/builder.rs index f9ea0f004564b..3eec43d3745f8 100644 --- a/compiler/rustc_codegen_gcc/src/builder.rs +++ b/compiler/rustc_codegen_gcc/src/builder.rs @@ -1070,6 +1070,55 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> { self.block.add_eval(None, self.context.new_call(None, memset, &[ptr, fill_byte, size])); } + fn 
make_memory_loop( + &mut self, + loop_name: &str, + start_ptrs: [Self::Value; VAR_COUNT], + steps: [Size; VAR_COUNT], + iterations: Self::Value, + body_visitor: BodyPtrsVisitor, + ) where + BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]), + { + assert!(VAR_COUNT > 0, "VAR_COUNT must be bigger than zero."); + + for step in steps { + assert_ne!(step.bytes(), 0, "We are iterating over memory, ZSTs unexpected."); + } + + let header_bb = self.append_sibling_block(&format!("{}_header", loop_name)); + let body_bb = self.append_sibling_block(&format!("{}_body", loop_name)); + let next_bb = self.append_sibling_block(&format!("{}_next", loop_name)); + + let zero = self.const_usize(0); + let additions: [Self::Value; VAR_COUNT] = steps.map(|st| self.const_usize(st.bytes())); + + let loop_i = self.llbb().get_function().new_local(None, self.type_size_t(), "loop_i"); + self.assign(loop_i, zero); + let loop_i_val = loop_i.to_rvalue(); + + self.br(header_bb); + + self.switch_to_block(header_bb); + let keep_going = self.icmp(IntPredicate::IntNE, loop_i_val, iterations); + self.cond_br(keep_going, body_bb, next_bb); + + self.switch_to_block(body_bb); + let current_ptrs: [Self::Value; VAR_COUNT] = core::array::from_fn( + |i|{ + let start = self.pointercast(start_ptrs[i], self.type_i8p()); + let offset = self.unchecked_umul(additions[i], loop_i_val); + self.inbounds_gep(self.type_i8(), start, &[offset]) + } + ); + body_visitor(self, ¤t_ptrs); + let next_i = self.unchecked_uadd(loop_i_val, self.const_usize(1)); + self.assign(loop_i, next_i); + self.br(header_bb); + + self.switch_to_block(next_bb); + } + fn select(&mut self, cond: RValue<'gcc>, then_val: RValue<'gcc>, mut else_val: RValue<'gcc>) -> RValue<'gcc> { let func = self.current_func(); let variable = func.new_local(None, then_val.get_type(), "selectVar"); diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index b4aa001547c4c..adf96ca8cd240 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -935,6 +935,54 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { } } + fn make_memory_loop( + &mut self, + loop_name: &str, + start_ptrs: [Self::Value; VAR_COUNT], + steps: [Size; VAR_COUNT], + iterations: Self::Value, + body_visitor: BodyPtrsVisitor, + ) where + BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]), + { + const { + assert!(VAR_COUNT > 0, "VAR_COUNT must be bigger than zero."); + } + for step in steps { + assert_ne!(step.bytes(), 0, "We are iterating over memory, ZSTs unexpected."); + } + + let zero = self.const_usize(0); + let additions: [Self::Value; VAR_COUNT] = steps.map(|st| self.const_usize(st.bytes())); + + let header_bb = self.append_sibling_block(&format!("{}_header", loop_name)); + let body_bb = self.append_sibling_block(&format!("{}_body", loop_name)); + let next_bb = self.append_sibling_block(&format!("{}_next", loop_name)); + self.br(header_bb); + + let mut header_bx = Builder::build(self.cx, header_bb); + // Use integer for iteration instead of pointers because LLVM canonicalize loop into indexed anyway. 
+ let loop_i = header_bx.phi(self.type_isize(), &[zero], &[self.llbb()]); + let keep_going = header_bx.icmp(IntPredicate::IntNE, loop_i, iterations); + header_bx.cond_br(keep_going, body_bb, next_bb); + + let mut body_bx = Builder::build(self.cx, body_bb); + let current_ptrs: [Self::Value; VAR_COUNT] = std::array::from_fn(|i| { + let start = start_ptrs[i]; + // FIXME: Remove pointercast after dropping supporting of LLVM 14. + let start = self.pointercast(start, self.type_i8p()); + let addition = additions[i]; + let offset = body_bx.unchecked_umul(loop_i, addition); + body_bx.inbounds_gep(body_bx.type_i8(), start, &[offset]) + }); + body_visitor(&mut body_bx, ¤t_ptrs); + let next_i = body_bx.unchecked_uadd(loop_i, body_bx.const_usize(1)); + header_bx.add_incoming_to_phi(loop_i, next_i, body_bb); + body_bx.br(header_bb); + + *self = Builder::build(self.cx, next_bb); + } + fn select( &mut self, cond: &'ll Value, diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs index 24968e00cc8e5..35ac7b33d3ff7 100644 --- a/compiler/rustc_codegen_llvm/src/lib.rs +++ b/compiler/rustc_codegen_llvm/src/lib.rs @@ -7,6 +7,7 @@ #![doc(html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/")] #![feature(extern_types)] #![feature(hash_raw_entry)] +#![feature(inline_const)] #![feature(iter_intersperse)] #![feature(let_chains)] #![feature(never_type)] diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs index 9ac2424e76be0..bf9610e5ee6ce 100644 --- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs @@ -9,12 +9,12 @@ use crate::meth; use crate::traits::*; use crate::MemFlags; -use rustc_middle::ty::{self, Ty, TyCtxt}; -use rustc_span::{sym, Span}; -use rustc_target::abi::{ - call::{FnAbi, PassMode}, - WrappingRange, -}; +use rustc_middle::ty; +use rustc_middle::ty::{Ty, TyCtxt}; +use rustc_span::sym; +use rustc_span::Span; +use rustc_target::abi::call::{FnAbi, PassMode}; +use rustc_target::abi::WrappingRange; fn copy_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( bx: &mut Bx, @@ -37,6 +37,442 @@ fn copy_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( } } +mod swap_intrinsic { + use crate::traits::*; + use crate::MemFlags; + + use rustc_middle::mir::interpret::PointerArithmetic; + use rustc_middle::ty::Ty; + use rustc_span::Span; + use rustc_target::abi::{Align, Size}; + use rustc_target::spec::HasTargetSpec; + + // Note: We deliberately interpret our values as some ranges of bytes + // for performance like did earlier in the old `core::mem::swap` implementation + // and use immediate values instead of PlaceRefs. + pub(super) fn single<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + ty: Ty<'tcx>, + span: Span, + ) { + let layout = bx.layout_of(ty); + if layout.is_unsized() { + span_bug!(span, "swap_nonoverlapping_single must be called only for sized types"); + } + if layout.is_zst() { + // no-op + return; + } + let should_use_2_temp_vals = { + // Primitive integer or something equal to it by size. + (layout.size <= bx.cx().pointer_size() && layout.size.bytes().is_power_of_two()) + // SPIR-V doesn't allow reinterpretation of values as chunks of arbitrary ints + // so we need to read and copy them full. + // For small values we use double read-double write. 
+ || (layout.size <= bx.cx().pointer_size() && bx.cx().target_spec().arch == "spirv") + }; + if should_use_2_temp_vals { + let ty = bx.backend_type(layout); + let align = layout.align.abi; + swap_using_2_temps(bx, x_ptr, y_ptr, ty, align); + return; + } + + // If need to swap large value, + // it probably better to do single memcpy from one elem + // to another after saving the old value. + let should_use_single_temp_val = { + // Most likely some `Simd` type from portable simd or manual simd. + // There is no difference with double read in release build + // but it reduces amount of code generated in debug build. + (layout.align.abi.bytes() == layout.size.bytes() && layout.size > bx.cx().pointer_size()) + // Probably aggregate with some SIMD type field. + // E.g. `Option`. + // Need to think how to do it better. + || layout.align.abi > bx.data_layout().pointer_align.abi + // SPIRV doesn't allow partial reads/writes and value reinterpretations + // so our best chance to reduce stack usage is to use single alloca. + || bx.cx().target_spec().arch == "spirv" + }; + if should_use_single_temp_val { + let ty = bx.backend_type(layout); + swap_using_single_temp(bx, x_ptr, y_ptr, ty, layout.size, layout.align.abi); + return; + } + + // Both LLVM and GCC seem to benefit from same splitting loops + // so place this code here to prevent duplication. + // https://godbolt.org/z/arzvePb8T + + if bx.cx().target_spec().arch == "x86_64" { + swap_unaligned_x86_64_single(bx, layout, x_ptr, y_ptr); + return; + } + + // Swap using aligned integers as chunks. + assert!(layout.align.abi.bytes() <= bx.pointer_size().bytes()); + assert_eq!(bx.data_layout().pointer_align.abi.bytes(), bx.pointer_size().bytes()); + let chunk_size = std::cmp::min(layout.align.abi.bytes(), bx.pointer_size().bytes()); + let chunk_size = Size::from_bytes(chunk_size); + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Bytes(layout.size), + ChunkInfo::IntChunk(chunk_size), + NumOfTemps::Two, + Align::from_bytes(chunk_size.bytes()).unwrap(), + ); + } + + // `x86_64` allows optimization using unaligned accesses + // because unaligned reads/writes are fast on x86_64. + // https://lemire.me/blog/2012/05/31/data-alignment-for-speed-myth-or-reality/ + // We manually swap last `x % ZMM_BYTES` bytes in a way that would always vectorize + // them AVX and/or SSE because both GCC and LLVM generate fails to use smaller SIMD registers + // if they had used larger ones. + fn swap_unaligned_x86_64_single<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + layout: Bx::LayoutOfResult, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + ) { + const ZMM_BYTES: u64 = 512 / 8; + const YMM_BYTES: u64 = 256 / 8; + const XMM_BYTES: u64 = 128 / 8; + + let min_align = Align::from_bytes(1).expect("One is always valid align."); + let ptr_size = bx.cx().pointer_size(); + // Need to do pointercasts because `rustc_codegen_gcc` ignores passed type + // in `inbounds_gep`. + let x_ptr = bx.pointercast(x_ptr, bx.type_i8p()); + let y_ptr = bx.pointercast(y_ptr, bx.type_i8p()); + + let mut total_offset = Size::ZERO; + // Make a loop that is vectorized using largest vectors. + // It would use largest available vectors, not necessary ZMM. 
+ if layout.size.bytes() >= ZMM_BYTES { + let to_swap = Size::from_bytes(layout.size.bytes() / ZMM_BYTES * ZMM_BYTES); + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Bytes(to_swap), + ChunkInfo::IntChunk(ptr_size), + NumOfTemps::Two, + min_align, + ); + total_offset += to_swap; + } + // This loop contents are based on knowledge from this: https://godbolt.org/z/Mr4rWfoad + // And this: https://godbolt.org/z/YzcWofG5Y + // Both LLVM and GCC fail to use SIMD registers for swapping tails without this. + for (num_temps, chunk_size) in [(4, YMM_BYTES), (2, XMM_BYTES)] { + let chunk_size = Size::from_bytes(chunk_size); + assert_eq!( + ptr_size * num_temps, + chunk_size, + "Invalid assumption about pointer size or register size", + ); + if layout.size < total_offset + chunk_size { + continue; + } + + let x_tmps_and_offsets: Vec<_> = (0..num_temps) + .map(|i| { + let curr_off = total_offset + i * ptr_size; + let curr_off = bx.const_usize(curr_off.bytes()); + let x_gep = bx.inbounds_gep(bx.type_i8(), x_ptr, &[curr_off]); + // FIXME: Remove pointercast after stopping support of LLVM 14. + let x_gep = bx.pointercast(x_gep, bx.type_ptr_to(bx.type_isize())); + (bx.load(bx.type_isize(), x_gep, min_align), curr_off) + }) + .collect(); + + let chunk_size_val = bx.const_usize(chunk_size.bytes()); + let chunk_offset = bx.const_usize(total_offset.bytes()); + let x_chunk_gep = bx.inbounds_gep(bx.type_i8(), x_ptr, &[chunk_offset]); + let y_chunk_gep = bx.inbounds_gep(bx.type_i8(), y_ptr, &[chunk_offset]); + // FIXME(AngelicosPhosphoros): Use memcpy.inline here. + bx.memcpy( + x_chunk_gep, + min_align, + y_chunk_gep, + min_align, + chunk_size_val, + MemFlags::UNALIGNED, + ); + for (x_tmp, curr_off) in x_tmps_and_offsets { + let y_gep = bx.inbounds_gep(bx.type_i8(), y_ptr, &[curr_off]); + // FIXME: Remove pointercast after stopping support of LLVM 14. + let y_gep = bx.pointercast(y_gep, bx.type_ptr_to(bx.type_isize())); + bx.store(x_tmp, y_gep, min_align); + } + + total_offset += chunk_size; + } + + // I decided to use swaps by pow2 ints here based + // on this codegen example: https://godbolt.org/z/rWYqMGnWh + // This loops implements it using minimal amount of instructions + // and registers involved. + let mut current_size = bx.pointer_size(); + while total_offset < layout.size { + // In each loop iteration, remaining amount of unswapped bytes + // is less than in previous iteration. + + assert_ne!(current_size, Size::ZERO, "We must had finished swapping when it was 1"); + + let next_size = Size::from_bytes(current_size.bytes() / 2); + if total_offset + current_size > layout.size { + current_size = next_size; + continue; + } + + let tail_offset = bx.const_usize(total_offset.bytes()); + let x_tail_ptr = bx.inbounds_gep(bx.type_i8(), x_ptr, &[tail_offset]); + let y_tail_ptr = bx.inbounds_gep(bx.type_i8(), y_ptr, &[tail_offset]); + + let chunt_ty = choose_int_by_size(bx, current_size); + swap_using_2_temps(bx, x_tail_ptr, y_tail_ptr, chunt_ty, min_align); + + total_offset += current_size; + current_size = next_size; + } + } + + // We cannot use some of optimizations available for [`single`] + // because we don't know how many bytes exactly we need to swap. 
+ pub(super) fn many<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + count: Bx::Value, + ty: Ty<'tcx>, + span: Span, + ) { + let layout = bx.layout_of(ty); + if layout.is_unsized() { + span_bug!(span, "swap_nonoverlapping_many must be called only for sized types"); + } + if layout.is_zst() { + // no-op + return; + } + + let must_not_split_values = { + // Unusual type, maybe some manual SIMD optimization. + layout.align.abi > bx.data_layout().pointer_align.abi && layout.align.abi.bytes() == layout.size.bytes() + // Probably aggregate with some SIMD type field. + // E.g. `Option`. + // Need to think how to do it better. + || layout.align.abi > bx.data_layout().pointer_align.abi + // SPIR-V doesn't allow reinterpretation of values as chunks of arbitrary ints + // so we need to read and copy them by element full. + || bx.cx().target_spec().arch == "spirv" + }; + + if must_not_split_values { + let back_ty = bx.backend_type(layout); + let num_of_temps = + if layout.size > bx.pointer_size() { NumOfTemps::Single } else { NumOfTemps::Two }; + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Iterations(count), + ChunkInfo::RealTyChunk(back_ty, layout.size), + num_of_temps, + layout.align.abi, + ); + return; + } + + let chunk_size = if bx.cx().target_spec().arch == "x86_64" { + // x86_64 allows unaligned reads/writes + // and it is relatively fast + // so try largest chunk available. + const INT_SIZES: [u64; 4] = [1, 2, 4, 8]; + INT_SIZES + .into_iter() + .map(Size::from_bytes) + .take_while(|x| *x <= layout.size) + .filter(|x| layout.size.bytes() % x.bytes() == 0) + .last() + .unwrap() + } else { + // Fallback to integer with size equal to alignment + Size::from_bytes(layout.align.abi.bytes()) + }; + + let chunks_per_elem = layout.size.bytes() / chunk_size.bytes(); + assert_ne!(chunks_per_elem, 0); + let iterations = if chunks_per_elem == 1 { + count + } else { + let chunks_per_elem = bx.const_usize(chunks_per_elem); + bx.unchecked_umul(count, chunks_per_elem) + }; + + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Iterations(iterations), + ChunkInfo::IntChunk(chunk_size), + NumOfTemps::Two, + // It iterates either by chunks equal to alignment + // or multiply of alignment so it would always be correct. + layout.align.abi, + ); + } + + fn choose_int_by_size<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + size: Size, + ) -> Bx::Type { + match size.bits() { + 8 => bx.type_i8(), + 16 => bx.type_i16(), + 32 => bx.type_i32(), + 64 => bx.type_i64(), + 128 => bx.type_i128(), + _ => unreachable!("Unexpected target int {:?}.", size), + } + } + + #[derive(Clone, Copy)] + enum ToSwap { + /// Size of region to swap. Useful when we know exact value. + Bytes(Size), + /// Number of chunks to swap. For runtime value. + Iterations(BxValue), + } + + #[derive(Clone, Copy)] + enum ChunkInfo { + /// When we want to use it directly + RealTyChunk(BxType, Size), + /// When we want to split value by integer chunk. 
+ IntChunk(Size), + } + + #[derive(Copy, Clone, Eq, PartialEq)] + enum NumOfTemps { + Single, + Two, + } + + fn make_swaps_loop<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + to_swap: ToSwap, + chunk_info: ChunkInfo, + num_of_temps: NumOfTemps, + access_align: Align, + ) { + let (ChunkInfo::IntChunk(chunk_size) | ChunkInfo::RealTyChunk(_, chunk_size)) = chunk_info; + + assert_ne!(chunk_size, Size::ZERO); + + if let ToSwap::Bytes(total_bytes) = to_swap { + assert!( + total_bytes > chunk_size, + "No need to generate loop when simple swap is enough." + ); + assert_eq!( + total_bytes.bytes() % chunk_size.bytes(), + 0, + "Cannot split size of swap into chunks." + ); + } + + assert_eq!( + chunk_size.bytes() % access_align.bytes(), + 0, + "Ensure that access align doesn't shift", + ); + + let chunk_ty = match chunk_info { + ChunkInfo::RealTyChunk(ty, _) => ty, + ChunkInfo::IntChunk(size) => choose_int_by_size(bx, size), + }; + + let iterations = match to_swap { + ToSwap::Bytes(s) => { + let iterations_val = s.bytes() / chunk_size.bytes(); + bx.const_usize(iterations_val) + } + ToSwap::Iterations(it) => it, + }; + + // Need to do pointercasts because `rustc_codegen_gcc` ignores passed type + // in `inbounds_gep`. + let x_ptr = bx.pointercast(x_ptr, bx.type_i8p()); + let y_ptr = bx.pointercast(y_ptr, bx.type_i8p()); + bx.make_memory_loop( + "swap_loop", + [x_ptr, y_ptr], + [chunk_size; 2], + iterations, + |body_bx, &[curr_x_ptr, curr_y_ptr]| match num_of_temps { + NumOfTemps::Single => swap_using_single_temp( + body_bx, + curr_x_ptr, + curr_y_ptr, + chunk_ty, + chunk_size, + access_align, + ), + NumOfTemps::Two => { + swap_using_2_temps(body_bx, curr_x_ptr, curr_y_ptr, chunk_ty, access_align) + } + }, + ); + } + + fn swap_using_2_temps<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + tmp_ty: Bx::Type, + access_align: Align, + ) { + // FIXME: Remove pointercast when stop support of LLVM 14. + let tmp_ptr_ty = bx.type_ptr_to(tmp_ty); + let x_ptr = bx.pointercast(x_ptr, tmp_ptr_ty); + let y_ptr = bx.pointercast(y_ptr, tmp_ptr_ty); + + let tmp_x = bx.load(tmp_ty, x_ptr, access_align); + let tmp_y = bx.load(tmp_ty, y_ptr, access_align); + bx.store(tmp_y, x_ptr, access_align); + bx.store(tmp_x, y_ptr, access_align); + } + + fn swap_using_single_temp<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + tmp_ty: Bx::Type, + tmp_size: Size, + access_align: Align, + ) { + // FIXME: Remove pointercast when stop support of LLVM 14. + let tmp_ptr_ty = bx.type_ptr_to(tmp_ty); + let x_ptr = bx.pointercast(x_ptr, tmp_ptr_ty); + let y_ptr = bx.pointercast(y_ptr, tmp_ptr_ty); + + let num_bytes = bx.const_usize(tmp_size.bytes()); + let tmp_x = bx.load(tmp_ty, x_ptr, access_align); + // FIXME(AngelicosPhosphoros): Use memcpy.inline here. 
+ bx.memcpy(x_ptr, access_align, y_ptr, access_align, num_bytes, MemFlags::empty()); + bx.store(tmp_x, y_ptr, access_align); + } +} + fn memset_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( bx: &mut Bx, volatile: bool, @@ -154,6 +590,27 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { ); return; } + sym::swap_nonoverlapping_single => { + swap_intrinsic::single( + bx, + args[0].immediate(), + args[1].immediate(), + substs.type_at(0), + span, + ); + return; + } + sym::swap_nonoverlapping_many => { + swap_intrinsic::many( + bx, + args[0].immediate(), + args[1].immediate(), + args[2].immediate(), + substs.type_at(0), + span, + ); + return; + } sym::write_bytes => { memset_intrinsic( bx, diff --git a/compiler/rustc_codegen_ssa/src/traits/builder.rs b/compiler/rustc_codegen_ssa/src/traits/builder.rs index 853c6934c2c24..a8fea579875bc 100644 --- a/compiler/rustc_codegen_ssa/src/traits/builder.rs +++ b/compiler/rustc_codegen_ssa/src/traits/builder.rs @@ -257,6 +257,21 @@ pub trait BuilderMethods<'a, 'tcx>: flags: MemFlags, ); + /// Loop that iterates over some memory using offsets steps. + /// Interprets pointers as u8 pointers. + /// `BodyPtrsVisitor` allow access to body and current iteration pointers. + /// Steps MUST not be zeros. + /// `steps[i]*iterations` MUST not overflow targets `usize`. + fn make_memory_loop( + &mut self, + loop_name: &str, + start_ptrs: [Self::Value; VAR_COUNT], + steps: [Size; VAR_COUNT], + iterations: Self::Value, + visitor: BodyPtrsVisitor, + ) where + BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]); + fn select( &mut self, cond: Self::Value, diff --git a/compiler/rustc_const_eval/src/interpret/intrinsics.rs b/compiler/rustc_const_eval/src/interpret/intrinsics.rs index 7192bbc00d556..3ae610e2954e2 100644 --- a/compiler/rustc_const_eval/src/interpret/intrinsics.rs +++ b/compiler/rustc_const_eval/src/interpret/intrinsics.rs @@ -291,6 +291,24 @@ impl<'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> InterpCx<'mir, 'tcx, M> { sym::write_bytes => { self.write_bytes_intrinsic(&args[0], &args[1], &args[2])?; } + sym::swap_nonoverlapping_single => { + let layout = self.layout_of(substs.type_at(0))?; + self.mem_swap_nonoverlapping( + self.read_pointer(&args[0])?, + self.read_pointer(&args[1])?, + 1, + layout, + )?; + } + sym::swap_nonoverlapping_many => { + let layout = self.layout_of(substs.type_at(0))?; + self.mem_swap_nonoverlapping( + self.read_pointer(&args[0])?, + self.read_pointer(&args[1])?, + self.read_target_usize(&args[2])?, + layout, + )?; + } sym::arith_offset => { let ptr = self.read_pointer(&args[0])?; let offset_count = self.read_target_isize(&args[1])?; diff --git a/compiler/rustc_const_eval/src/interpret/memory.rs b/compiler/rustc_const_eval/src/interpret/memory.rs index 1125d8d1f0e08..eb33c1f4fdb93 100644 --- a/compiler/rustc_const_eval/src/interpret/memory.rs +++ b/compiler/rustc_const_eval/src/interpret/memory.rs @@ -21,10 +21,10 @@ use rustc_target::abi::{Align, HasDataLayout, Size}; use crate::const_eval::CheckAlignment; use crate::fluent_generated as fluent; +use super::alloc_range; use super::{ - alloc_range, AllocBytes, AllocId, AllocMap, AllocRange, Allocation, CheckInAllocMsg, - GlobalAlloc, InterpCx, InterpResult, Machine, MayLeak, Pointer, PointerArithmetic, Provenance, - Scalar, + AllocBytes, AllocId, AllocMap, AllocRange, Allocation, CheckInAllocMsg, GlobalAlloc, InterpCx, + InterpResult, Machine, MayLeak, Pointer, PointerArithmetic, Provenance, Scalar, }; #[derive(Debug, PartialEq, Copy, Clone)] @@ -1222,6 
+1222,59 @@ impl<'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> InterpCx<'mir, 'tcx, M> { Ok(()) } + + pub fn mem_swap_nonoverlapping( + &mut self, + x_ptr: Pointer>, + y_ptr: Pointer>, + count: u64, + layout: ty::layout::TyAndLayout<'tcx>, + ) -> InterpResult<'tcx> { + let elem_size = layout.size; + let align = layout.align.abi; + + if count > i64::MAX as u64 { + throw_ub_format!("`count` argument to `swap_nonoverlapping_many` is too large."); + } + + let first_ptr_acc = self.get_ptr_access(x_ptr, elem_size * count, align)?; + let second_ptr_acc = self.get_ptr_access(y_ptr, elem_size * count, align)?; + + let Some((x_alloc_id, x_offset, _)) = first_ptr_acc else { + assert_eq!(elem_size, Size::ZERO); + // Called on ZST so it is noop. + return Ok(()) + }; + let Some((y_alloc_id, y_offset, _)) = second_ptr_acc else { + unreachable!("If right param is ZST, left must be too") + }; + + if x_alloc_id == y_alloc_id { + if (x_offset..x_offset + elem_size * count).contains(&y_offset) + || (y_offset..y_offset + elem_size * count).contains(&x_offset) + { + throw_ub_format!("swap was called on overlapping memory."); + } + } + + if count == 0 { + return Ok(()); + } + + let tmp_stack_alloc = self.allocate(layout, MemoryKind::Stack)?; + + for i in 0..i64::try_from(count).unwrap() { + let curr_x_ptr = self.ptr_offset_inbounds(x_ptr, layout.ty, i)?; + let curr_y_ptr = self.ptr_offset_inbounds(y_ptr, layout.ty, i)?; + + self.mem_copy(curr_x_ptr, align, tmp_stack_alloc.ptr, align, elem_size, true)?; + self.mem_copy(curr_y_ptr, align, curr_x_ptr, align, elem_size, true)?; + self.mem_copy(tmp_stack_alloc.ptr, align, curr_y_ptr, align, elem_size, true)?; + } + self.deallocate_ptr(tmp_stack_alloc.ptr, Some((elem_size, align)), MemoryKind::Stack)?; + + Ok(()) + } } /// Machine pointer introspection. diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs index 36c468e778986..b93bc57e2002e 100644 --- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs +++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs @@ -278,6 +278,23 @@ pub fn check_intrinsic_type(tcx: TyCtxt<'_>, it: &hir::ForeignItem<'_>) { ], tcx.mk_unit(), ), + sym::swap_nonoverlapping_single => ( + 1, + vec![ + tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }), + tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }), + ], + tcx.mk_unit(), + ), + sym::swap_nonoverlapping_many => ( + 1, + vec![ + tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }), + tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }), + tcx.types.usize, + ], + tcx.mk_unit(), + ), sym::sqrtf32 => (0, vec![tcx.types.f32], tcx.types.f32), sym::sqrtf64 => (0, vec![tcx.types.f64], tcx.types.f64), sym::powif32 => (0, vec![tcx.types.f32, tcx.types.i32], tcx.types.f32), diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index c5ce2575fff06..aacee6a44ce1f 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1480,6 +1480,8 @@ symbols! 
{ sub_assign, sub_with_overflow, suggestion, + swap_nonoverlapping_many, + swap_nonoverlapping_single, sym, sync, t32, diff --git a/library/core/src/intrinsics.rs b/library/core/src/intrinsics.rs index 9b8612485ac1e..7eb145e99e73c 100644 --- a/library/core/src/intrinsics.rs +++ b/library/core/src/intrinsics.rs @@ -2768,6 +2768,67 @@ pub const unsafe fn copy(src: *const T, dst: *mut T, count: usize) { } } +#[cfg(not(bootstrap))] +extern "rust-intrinsic" { + /// This is an implementation detail of [`crate::mem::swap`] and should + /// not be used anywhere else. + /// + /// Swaps 2 values using minimal extra memory depending on target. + /// Created to remove target/backend specific optimizations from library code to + /// make MIR-level optimizations simpler to implement. + /// + /// The operation is "untyped" in the sense that data may be uninitialized or otherwise violate the + /// requirements of `T`. The initialization state is preserved exactly. + /// + /// # Safety + /// + /// Behavior is undefined if any of the following conditions are violated: + /// + /// * Both `x` and `y` must be valid for both reads and writes of `size_of::()` bytes. + /// + /// * Both `x` and `y` must be properly aligned. + /// + /// * The region of memory beginning at `x` with a size of `size_of::()` + /// bytes must *not* overlap with the region of memory beginning at `y` + /// with the same size. + /// + /// Note that even if the effectively copied size (`size_of::()`) is `0`, + /// the pointers must be non-null and properly aligned. + #[rustc_nounwind] + #[rustc_const_unstable(feature = "const_swap", issue = "83163")] + pub fn swap_nonoverlapping_single(x: *mut T, y: *mut T); + + /// This is an implementation detail of [`crate::ptr::swap_nonoverlapping`] and should + /// not be used anywhere else. + /// + /// Swaps 2 ranges of values starting from `x` and `y` using minimal extra memory depending on target. + /// Created to remove target/backend specific optimizations from library code to + /// make MIR-level optimizations simpler to implement. + /// + /// The operation is "untyped" in the sense that data may be uninitialized or otherwise violate the + /// requirements of `T`. The initialization state is preserved exactly. + /// + /// # Safety + /// + /// Behavior is undefined if any of the following conditions are violated: + /// + /// * Both `x` and `y` must be valid for both reads and writes of `count * + /// size_of::()` bytes. + /// + /// * Both `x` and `y` must be properly aligned. + /// + /// * The region of memory beginning at `x` with a size of `count * + /// size_of::()` bytes must *not* overlap with the region of memory + /// beginning at `y` with the same size. + /// + /// Note that even if the effectively copied size (`count * size_of::()`) is `0`, + /// the pointers must be non-null and properly aligned. + /// + #[rustc_nounwind] + #[rustc_const_unstable(feature = "const_swap", issue = "83163")] + pub fn swap_nonoverlapping_many(x: *mut T, y: *mut T, count: usize); +} + /// Sets `count * size_of::()` bytes of memory starting at `dst` to /// `val`. 
/// diff --git a/library/core/src/mem/mod.rs b/library/core/src/mem/mod.rs index 39c9a04eea92b..9b374745cd3c5 100644 --- a/library/core/src/mem/mod.rs +++ b/library/core/src/mem/mod.rs @@ -724,40 +724,52 @@ pub unsafe fn uninitialized() -> T { #[stable(feature = "rust1", since = "1.0.0")] #[rustc_const_unstable(feature = "const_swap", issue = "83163")] pub const fn swap(x: &mut T, y: &mut T) { - // NOTE(eddyb) SPIR-V's Logical addressing model doesn't allow for arbitrary - // reinterpretation of values as (chunkable) byte arrays, and the loop in the - // block optimization in `swap_slice` is hard to rewrite back - // into the (unoptimized) direct swapping implementation, so we disable it. - // FIXME(eddyb) the block optimization also prevents MIR optimizations from - // understanding `mem::replace`, `Option::take`, etc. - a better overall - // solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which - // a backend can choose to implement using the block optimization, or not. - #[cfg(not(any(target_arch = "spirv")))] + #[cfg(bootstrap)] { - // For types that are larger multiples of their alignment, the simple way - // tends to copy the whole thing to stack rather than doing it one part - // at a time, so instead treat them as one-element slices and piggy-back - // the slice optimizations that will split up the swaps. - if size_of::() / align_of::() > 4 { - // SAFETY: exclusive references always point to one non-overlapping - // element and are non-null and properly aligned. - return unsafe { ptr::swap_nonoverlapping(x, y, 1) }; + // NOTE(eddyb) SPIR-V's Logical addressing model doesn't allow for arbitrary + // reinterpretation of values as (chunkable) byte arrays, and the loop in the + // block optimization in `swap_slice` is hard to rewrite back + // into the (unoptimized) direct swapping implementation, so we disable it. + // FIXME(eddyb) the block optimization also prevents MIR optimizations from + // understanding `mem::replace`, `Option::take`, etc. - a better overall + // solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which + // a backend can choose to implement using the block optimization, or not. + #[cfg(not(any(target_arch = "spirv")))] + { + // For types that are larger multiples of their alignment, the simple way + // tends to copy the whole thing to stack rather than doing it one part + // at a time, so instead treat them as one-element slices and piggy-back + // the slice optimizations that will split up the swaps. + if size_of::() / align_of::() > 4 { + // SAFETY: exclusive references always point to one non-overlapping + // element and are non-null and properly aligned. + return unsafe { ptr::swap_nonoverlapping(x, y, 1) }; + } } - } - // If a scalar consists of just a small number of alignment units, let - // the codegen just swap those pieces directly, as it's likely just a - // few instructions and anything else is probably overcomplicated. - // - // Most importantly, this covers primitives and simd types that tend to - // have size=align where doing anything else can be a pessimization. - // (This will also be used for ZSTs, though any solution works for them.) - swap_simple(x, y); + // If a scalar consists of just a small number of alignment units, let + // the codegen just swap those pieces directly, as it's likely just a + // few instructions and anything else is probably overcomplicated. 
+ // + // Most importantly, this covers primitives and simd types that tend to + // have size=align where doing anything else can be a pessimization. + // (This will also be used for ZSTs, though any solution works for them.) + swap_simple(x, y); + } + #[cfg(not(bootstrap))] + // SAFETY: since `x` and `y` are mutable references, + // 1. `x` and `y` are initialized. + // 2. `x` and `y` cannot overlap. + // 3. `x` and `y` are aligned. + unsafe { + core::intrinsics::swap_nonoverlapping_single(x, y); + } } /// Same as [`swap`] semantically, but always uses the simple implementation. /// /// Used elsewhere in `mem` and `ptr` at the bottom layer of calls. +#[cfg(bootstrap)] #[rustc_const_unstable(feature = "const_swap", issue = "83163")] #[inline] pub(crate) const fn swap_simple(x: &mut T, y: &mut T) { diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs index acc9ca29d41a1..a6de56da986ae 100644 --- a/library/core/src/ptr/mod.rs +++ b/library/core/src/ptr/mod.rs @@ -908,25 +908,6 @@ pub const unsafe fn swap(x: *mut T, y: *mut T) { #[stable(feature = "swap_nonoverlapping", since = "1.27.0")] #[rustc_const_unstable(feature = "const_swap", issue = "83163")] pub const unsafe fn swap_nonoverlapping(x: *mut T, y: *mut T, count: usize) { - #[allow(unused)] - macro_rules! attempt_swap_as_chunks { - ($ChunkTy:ty) => { - if mem::align_of::() >= mem::align_of::<$ChunkTy>() - && mem::size_of::() % mem::size_of::<$ChunkTy>() == 0 - { - let x: *mut $ChunkTy = x.cast(); - let y: *mut $ChunkTy = y.cast(); - let count = count * (mem::size_of::() / mem::size_of::<$ChunkTy>()); - // SAFETY: these are the same bytes that the caller promised were - // ok, just typed as `MaybeUninit`s instead of as `T`s. - // The `if` condition above ensures that we're not violating - // alignment requirements, and that the division is exact so - // that we don't lose any bytes off the end. - return unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }; - } - }; - } - // SAFETY: the caller must guarantee that `x` and `y` are // valid for writes and properly aligned. unsafe { @@ -940,19 +921,16 @@ pub const unsafe fn swap_nonoverlapping(x: *mut T, y: *mut T, count: usize) { ); } - // Split up the slice into small power-of-two-sized chunks that LLVM is able - // to vectorize (unless it's a special type with more-than-pointer alignment, - // because we don't want to pessimize things like slices of SIMD vectors.) - if mem::align_of::() <= mem::size_of::() - && (!mem::size_of::().is_power_of_two() - || mem::size_of::() > mem::size_of::() * 2) - { - attempt_swap_as_chunks!(usize); - attempt_swap_as_chunks!(u8); + #[cfg(bootstrap)] + // SAFETY: Same preconditions as this function + unsafe { + swap_nonoverlapping_simple_untyped(x, y, count) } - + #[cfg(not(bootstrap))] // SAFETY: Same preconditions as this function - unsafe { swap_nonoverlapping_simple_untyped(x, y, count) } + unsafe { + intrinsics::swap_nonoverlapping_many(x, y, count) + } } /// Same behaviour and safety conditions as [`swap_nonoverlapping`] @@ -960,6 +938,7 @@ pub const unsafe fn swap_nonoverlapping(x: *mut T, y: *mut T, count: usize) { /// LLVM can vectorize this (at least it can for the power-of-two-sized types /// `swap_nonoverlapping` tries to use) so no need to manually SIMD it. 
#[inline] +#[cfg(bootstrap)] #[rustc_const_unstable(feature = "const_swap", issue = "83163")] const unsafe fn swap_nonoverlapping_simple_untyped(x: *mut T, y: *mut T, count: usize) { let x = x.cast::>(); diff --git a/library/core/tests/mem.rs b/library/core/tests/mem.rs index 5c2e18745ea21..251ebb274c520 100644 --- a/library/core/tests/mem.rs +++ b/library/core/tests/mem.rs @@ -103,6 +103,92 @@ fn test_swap() { assert_eq!(y, 31337); } +#[test] +fn test_many() { + // This tests if chunking works properly + fn swap_sized(a: T, b: T) { + let mut x: [T; SIZE] = [a; SIZE]; + let mut y: [T; SIZE] = [b; SIZE]; + swap::<[T; SIZE]>(&mut x, &mut y); + assert_eq!(x, [b; SIZE]); + assert_eq!(y, [a; SIZE]); + } + + fn swap_t(a: T, b: T) { + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + } + + swap_t::(7, 0xFF); + swap_t::(0xFAFA, 0x9898); + swap_t::(0xF0F0_F0F0, 0x0E0E_0E0E); + swap_t::(7, 8); + + #[derive(Eq, PartialEq, Debug)] + #[repr(align(32))] + struct LargeAlign([u8; 32]); + + let mut x = LargeAlign([9; 32]); + let mut y = LargeAlign([20; 32]); + swap(&mut x, &mut y); + assert_eq!(x, LargeAlign([20; 32])); + assert_eq!(y, LargeAlign([9; 32])); + + #[derive(Eq, PartialEq, Debug)] + #[repr(align(32))] + struct LargeAlignAndSize([u8; 96]); + + let mut x = LargeAlignAndSize([9; 96]); + let mut y = LargeAlignAndSize([20; 96]); + swap(&mut x, &mut y); + assert_eq!(x, LargeAlignAndSize([20; 96])); + assert_eq!(y, LargeAlignAndSize([9; 96])); + + #[derive(Eq, PartialEq, Debug)] + struct WithPadding { + a: u16, + b: u64, + } + + let mut x = WithPadding { a: 7, b: 27 }; + let mut y = WithPadding { a: 77, b: u64::MAX }; + swap(&mut x, &mut y); + assert_eq!(x, WithPadding { a: 77, b: u64::MAX }); + assert_eq!(y, WithPadding { a: 7, b: 27 }); +} + #[test] fn test_replace() { let mut x = Some("test".to_string()); diff --git a/library/core/tests/ptr.rs b/library/core/tests/ptr.rs index c02cd99cc4477..80837f040039b 100644 --- a/library/core/tests/ptr.rs +++ b/library/core/tests/ptr.rs @@ -1088,6 +1088,28 @@ fn swap_copy_untyped() { assert_eq!(y, 5); } +#[test] +fn test_swap_unaligned_on_x86_64() { + #[derive(Copy, Clone, Eq, PartialEq, Debug)] + struct AlignedTo2([u16; 4]); + + assert!( + mem::size_of::() >= mem::size_of::() + && mem::align_of::() == 2 + && mem::align_of::() < mem::align_of::() + ); + + let buff0: &mut [_] = &mut [AlignedTo2([1, 2, 3, 4]); 20]; + let buff1: &mut [_] = &mut [AlignedTo2([5, 6, 7, 8]); 20]; + let len = 20; + + unsafe { + swap_nonoverlapping(buff0.as_mut_ptr(), buff1.as_mut_ptr(), read_volatile(&len)); + } + assert_eq!(buff0, &[AlignedTo2([5, 6, 7, 8]); 20]); + assert_eq!(buff1, &[AlignedTo2([1, 2, 3, 4]); 20]); +} + #[test] fn test_const_copy() { const { diff --git a/tests/codegen/swap-large-types.rs b/tests/codegen/swap-large-types.rs index 4a68403578d1e..69caa87514f29 100644 --- 
a/tests/codegen/swap-large-types.rs +++ b/tests/codegen/swap-large-types.rs @@ -16,7 +16,7 @@ type KeccakBuffer = [[u64; 5]; 5]; // CHECK-LABEL: @swap_basic #[no_mangle] pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { -// CHECK: alloca [5 x [5 x i64]] + // CHECK: alloca [5 x [5 x i64]] // SAFETY: exclusive references are always valid to read/write, // are non-overlapping, and nothing here panics so it's drop-safe. @@ -33,9 +33,14 @@ pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { // CHECK-LABEL: @swap_std #[no_mangle] pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i64> -// CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load <{{[0-9]+}} x i64>{{.*}}align 8 + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store <{{[0-9]+}} x i64>{{.*}}align 8 + // CHECK-NOT: alloca + // CHECK-NOT: br swap(x, y) } @@ -45,9 +50,11 @@ pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { // CHECK-LABEL: @swap_slice #[no_mangle] pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i64> -// CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i{{8|16|32|64}}>{{.*}}align 8 + // CHECK-NOT: alloca + // CHECK: store <{{[0-9]+}} x i{{8|16|32|64}}>{{.*}}align 8 + // CHECK-NOT: alloca if x.len() == y.len() { x.swap_with_slice(y); } @@ -60,32 +67,24 @@ type OneKilobyteBuffer = [u8; 1024]; // CHECK-LABEL: @swap_1kb_slices #[no_mangle] pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i8> -// CHECK: store <{{[0-9]+}} x i8> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i{{8|16|32|64}}> + // CHECK: store <{{[0-9]+}} x i{{8|16|32|64}}> + // CHECK-NOT: alloca if x.len() == y.len() { x.swap_with_slice(y); } } -// This verifies that the 2×read + 2×write optimizes to just 3 memcpys -// for an unusual type like this. It's not clear whether we should do anything -// smarter in Rust for these, so for now it's fine to leave these up to the backend. -// That's not as bad as it might seem, as for example, LLVM will lower the -// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature. -// Eventually we'll be able to pass `align_of::` to a const generic and -// thus pick a smarter chunk size ourselves without huge code duplication. 
- #[repr(align(64))] pub struct BigButHighlyAligned([u8; 64 * 3]); // CHECK-LABEL: @swap_big_aligned #[no_mangle] pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) { -// CHECK-NOT: call void @llvm.memcpy -// CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) -// CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) -// CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) -// CHECK-NOT: call void @llvm.memcpy + // CHECK-NOT: alloca + // CHECK-NOT: call void @llvm.memcpy + // CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) + // CHECK-NOT: call void @llvm.memcpy swap(x, y) } diff --git a/tests/codegen/swap-simd-types.rs b/tests/codegen/swap-simd-types.rs index 3472a42b0e65e..5d702144b3973 100644 --- a/tests/codegen/swap-simd-types.rs +++ b/tests/codegen/swap-simd-types.rs @@ -1,4 +1,4 @@ -// compile-flags: -O -C target-feature=+avx +// compile-flags: -Copt-level=3 -C target-feature=+avx // only-x86_64 // ignore-debug: the debug assertions get in the way @@ -35,7 +35,7 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) { #[no_mangle] pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) { // CHECK-NOT: alloca -// CHECK: load <32 x i8>{{.+}}align 1 -// CHECK: store <32 x i8>{{.+}}align 1 +// CHECK: load <4 x i64>{{.+}}align 1 +// CHECK: store <4 x i64>{{.+}}align 1 swap(x, y) } diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs index 419645a3fc6bc..03dd4f7a5dddd 100644 --- a/tests/codegen/swap-small-types.rs +++ b/tests/codegen/swap-small-types.rs @@ -1,8 +1,9 @@ -// compile-flags: -O -Z merge-functions=disabled +// compile-flags: -Copt-level=3 -Z merge-functions=disabled // only-x86_64 // ignore-debug: the debug assertions get in the way #![crate_type = "lib"] +#![feature(portable_simd)] use std::mem::swap; @@ -26,10 +27,61 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) { #[no_mangle] pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) { // CHECK-NOT: alloca - // CHECK: load <3 x i16> - // CHECK: load <3 x i16> - // CHECK: store <3 x i16> - // CHECK: store <3 x i16> + // CHECK-NOT: br + // CHECK: load i32 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store i32 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load i16 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store i16 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: ret void + swap(x, y) +} + +// CHECK-LABEL: @swap_vecs +#[no_mangle] +pub fn swap_vecs(x: &mut Vec, y: &mut Vec) { + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: load <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: load i64 + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: store i64 + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: ret void + swap(x, y) +} + +// CHECK-LABEL: @swap_slices +#[no_mangle] +pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) { + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load <{{[0-9]+}} x i64> + // CHECK: store <{{[0-9]+}} x i64> + // CHECK: ret void swap(x, y) } @@ -40,23 +92,23 @@ type RGB24 = [u8; 3]; // CHECK-LABEL: @swap_rgb24_slices #[no_mangle] pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i8> -// CHECK: store <{{[0-9]+}} x i8> + // 
CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i8> + // CHECK: store <{{[0-9]+}} x i8> if x.len() == y.len() { x.swap_with_slice(y); } } -// This one has a power-of-two size, so we iterate over it directly +// This one has a power-of-two size, so we iterate over it using ints. type RGBA32 = [u8; 4]; // CHECK-LABEL: @swap_rgba32_slices #[no_mangle] pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i32> -// CHECK: store <{{[0-9]+}} x i32> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i32> + // CHECK: store <{{[0-9]+}} x i32> if x.len() == y.len() { x.swap_with_slice(y); } @@ -69,10 +121,38 @@ const _: () = assert!(!std::mem::size_of::().is_power_of_two()); // CHECK-LABEL: @swap_string_slices #[no_mangle] pub fn swap_string_slices(x: &mut [String], y: &mut [String]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i64> -// CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i64> + // CHECK: store <{{[0-9]+}} x i64> if x.len() == y.len() { x.swap_with_slice(y); } } + +#[repr(C, packed)] +pub struct Packed { + pub first: bool, + pub second: u64, +} + +// CHECK-LABEL: @swap_packed_structs +#[no_mangle] +pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) { + // CHECK-NOT: alloca + // CHECK: ret void + swap(x, y) +} + +// CHECK-LABEL: @swap_simd_type +#[no_mangle] +pub fn swap_simd_type(x: &mut std::simd::f32x4, y: &mut std::simd::f32x4){ + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load <4 x float> + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store <4 x float> + // CHECK: ret void + swap(x, y) +} diff --git a/tests/ui/consts/missing_span_in_backtrace.stderr b/tests/ui/consts/missing_span_in_backtrace.stderr index fcfb9fbb3f8c0..1535777326566 100644 --- a/tests/ui/consts/missing_span_in_backtrace.stderr +++ b/tests/ui/consts/missing_span_in_backtrace.stderr @@ -3,12 +3,6 @@ error[E0080]: evaluation of constant value failed | = note: unable to copy parts of a pointer from memory at ALLOC_ID | -note: inside `std::ptr::read::>>` - --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL -note: inside `mem::swap_simple::>>` - --> $SRC_DIR/core/src/mem/mod.rs:LL:COL -note: inside `ptr::swap_nonoverlapping_simple_untyped::>` - --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL note: inside `swap_nonoverlapping::>` --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL note: inside `X` diff --git a/tests/ui/intrinsics/swap_nonoverlapping_single.rs b/tests/ui/intrinsics/swap_nonoverlapping_single.rs new file mode 100644 index 0000000000000..64afeceeb7ca8 --- /dev/null +++ b/tests/ui/intrinsics/swap_nonoverlapping_single.rs @@ -0,0 +1,132 @@ +#![feature(core_intrinsics, const_mut_refs, const_swap)] +#![crate_type = "rlib"] + +//! This module tests if `swap_nonoverlapping_single` works properly in const contexts. 
+ +use std::intrinsics::swap_nonoverlapping_single; + +pub const OK_A: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + assert!(a == 5, "Must NOT fail."); + assert!(b == 0, "Must NOT fail."); +}; + +pub const ERR_A0: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(a != 5, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +pub const ERR_A1: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(b != 0, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +// This must NOT fail. +pub const B: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + assert!(a == 0, "Must NOT fail."); + assert!(b == 5, "Must NOT fail."); +}; + +pub const ERR_B0: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(a != 0, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +pub const ERR_B1: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(b != 5, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +// This must NOT fail. +pub const NON_OVERLAPPING_PTRS: () = { + let mut chunk = [0_i32, 1, 2, 3]; + + let ptr = chunk.as_mut_ptr(); + let ptr2 = unsafe { ptr.add(2) }; + let x: &mut [i32; 2] = unsafe { &mut *ptr.cast() }; + let y: &mut [i32; 2] = unsafe { &mut *ptr2.cast() }; + unsafe { + swap_nonoverlapping_single(x, y); + } + + assert!(matches!(chunk, [2, 3, 0, 1]), "Must NOT fail."); +}; + +pub const OVERLAPPING_PTRS_0: () = { + let mut chunk = [0_i32, 1, 2, 3]; + + let ptr = chunk.as_mut_ptr(); + let ptr2 = unsafe { ptr.add(1) }; + let x: &mut [i32; 2] = unsafe { &mut *ptr.cast() }; + let y: &mut [i32; 2] = unsafe { &mut *ptr2.cast() }; + + unsafe { + swap_nonoverlapping_single(x, y); //~ ERROR evaluation of constant value failed + } +}; + +pub const OVERLAPPING_PTRS_1: () = { + let mut val = 7; + + let ptr: *mut _ = &mut val; + let x: &mut i32 = unsafe { &mut *ptr }; + let y: &mut i32 = unsafe { &mut *ptr }; + + unsafe { + swap_nonoverlapping_single(x, y); //~ ERROR evaluation of constant value failed + } +}; + +pub const OK_STRUCT: () = { + struct Adt { + fl: bool, + val: usize, + } + let mut a = Adt { fl: false, val: 10 }; + let mut b = Adt { fl: true, val: 77 }; + + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(matches!(a, Adt { fl: true, val: 77 }), "Must NOT fail."); + assert!(matches!(b, Adt { fl: false, val: 10 }), "Must NOT fail."); +}; diff --git a/tests/ui/intrinsics/swap_nonoverlapping_single.stderr b/tests/ui/intrinsics/swap_nonoverlapping_single.stderr new file mode 100644 index 0000000000000..d9a77927ea13a --- /dev/null +++ b/tests/ui/intrinsics/swap_nonoverlapping_single.stderr @@ -0,0 +1,47 @@ +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:25:5 + | +LL | assert!(a != 5, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:25:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the 
expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:35:5 + | +LL | assert!(b != 0, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:35:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:62:5 + | +LL | assert!(a != 0, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:62:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:75:5 + | +LL | assert!(b != 5, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:75:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:102:9 + | +LL | swap_nonoverlapping_single(x, y); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ swap was called on overlapping memory. + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:114:9 + | +LL | swap_nonoverlapping_single(x, y); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ swap was called on overlapping memory. + +error: aborting due to 6 previous errors + +For more information about this error, try `rustc --explain E0080`.
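
For reference, a minimal Rust sketch (not part of the patch) of the chunk-swapping strategy the new lowering emits for ordinary integer-aligned types: split both regions into pointer-sized chunks, swap each chunk through two temporaries, then swap the remaining tail bytes. The function name `swap_chunks_illustration` is hypothetical; the real codegen (`make_swaps_loop` with `NumOfTemps::Two`) operates on backend values rather than Rust pointers, and additionally uses wider unaligned chunks on x86_64 and a single-temporary memcpy variant for large over-aligned types.

    /// Swap `len` bytes between two non-overlapping buffers using `usize`-sized
    /// chunks plus a byte-wise tail. This mirrors, at the library level, what the
    /// intrinsic roughly lowers to for types without special alignment needs.
    unsafe fn swap_chunks_illustration(x: *mut u8, y: *mut u8, len: usize) {
        let chunk = core::mem::size_of::<usize>();
        let chunked = len / chunk * chunk;

        let mut off = 0;
        while off < chunked {
            // Two temporaries per chunk: load both sides, then store them crossed.
            let xp = x.add(off) as *mut usize;
            let yp = y.add(off) as *mut usize;
            let tx = xp.read_unaligned();
            let ty = yp.read_unaligned();
            xp.write_unaligned(ty);
            yp.write_unaligned(tx);
            off += chunk;
        }
        // Tail: swap any remaining bytes one by one.
        while off < len {
            let tx = *x.add(off);
            *x.add(off) = *y.add(off);
            *y.add(off) = tx;
            off += 1;
        }
    }

    fn main() {
        let mut a = *b"hello, swap intrinsic!";
        let mut b = *b"HELLO, SWAP INTRINSIC!";
        // The buffers are distinct locals, so they cannot overlap.
        unsafe { swap_chunks_illustration(a.as_mut_ptr(), b.as_mut_ptr(), a.len()) };
        assert_eq!(&a, b"HELLO, SWAP INTRINSIC!");
        assert_eq!(&b, b"hello, swap intrinsic!");
    }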