[WIP] Use unaligned read/writes for core::mem::swap on x86_64 #98892

Closed
251 changes: 245 additions & 6 deletions library/core/src/mem/mod.rs
@@ -734,14 +734,23 @@ pub const fn swap<T>(x: &mut T, y: &mut T) {
// a backend can choose to implement using the block optimization, or not.
#[cfg(not(any(target_arch = "spirv")))]
{
// Types with alignment bigger than usize are almost always used for
// hand-tuned SIMD optimizations, so we don't get in the way.
//
// For types that are larger multiples of their alignment, the simple way
// tends to copy the whole thing to stack rather than doing it one part
// at a time, so instead treat them as one-element slices and piggy-back
// the slice optimizations that will split up the swaps.
if size_of::<T>() / align_of::<T>() > 4 {
// SAFETY: exclusive references always point to one non-overlapping
// element and are non-null and properly aligned.
return unsafe { ptr::swap_nonoverlapping(x, y, 1) };
// at a time, so instead try to split them into chunks that fit into registers
// and swap chunks.
if const {
let size = size_of::<T>();
let align = align_of::<T>();
// It is weird that LLVM sometimes optimizes `4*align` fine while failing `3*align`.
// Proof: https://godbolt.org/z/MhnqvjjPz
(align < align_of::<usize>() && (size == 3 * align || size > 4 * align))
// And usize fails even for `4*align`.
|| (align == align_of::<usize>() && size > 2 * align)
} {
return swap_chunked(x, y);
}
}
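As a standalone sketch (not part of the diff) of which types the new condition above routes to `swap_chunked`: the helper name `takes_chunked_path` is made up for the illustration, and the concrete sizes assume a 64-bit target where `usize` is 8 bytes.

use std::mem::{align_of, size_of};

// Mirrors the compile-time condition in the hunk above, evaluated at runtime for illustration.
fn takes_chunked_path<T>() -> bool {
    let size = size_of::<T>();
    let align = align_of::<T>();
    (align < align_of::<usize>() && (size == 3 * align || size > 4 * align))
        || (align == align_of::<usize>() && size > 2 * align)
}

fn main() {
    assert!(takes_chunked_path::<[u16; 3]>()); // size 6 == 3 * align 2
    assert!(takes_chunked_path::<Vec<u32>>()); // size 24 > 2 * align 8
    assert!(!takes_chunked_path::<u64>());     // size 8 is not > 2 * align 8
}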

@@ -784,6 +793,236 @@ pub(crate) const fn swap_simple<T>(x: &mut T, y: &mut T) {
}
}

// This version swaps two values in chunks of usize and smaller
// using unaligned reads and writes, because those have been cheap
// on modern x86_64 processors for at least 10 years now (as of 2022-07-04).
// https://lemire.me/blog/2012/05/31/data-alignment-for-speed-myth-or-reality/
// It also generates fewer instructions and memory accesses: https://godbolt.org/z/Mr4rWfoad
// Feel free to add other targets as well if they have fast unaligned accesses.
//
// This should be done by the backend optimizer, but it currently fails to do so.
#[cfg(target_arch = "x86_64")]
#[rustc_const_unstable(feature = "const_swap", issue = "83163")]
#[inline]
const fn swap_chunked<T: Sized>(x: &mut T, y: &mut T) {
// Algorithm:
// 1. Swap the first `n*ZMM_BYTES` bytes using `u64` chunks, which LLVM reliably autovectorizes.
// 2. Force the backend to use SIMD registers (YMM or XMM) via `force_swap_simd`.
// 3. Swap the remaining bytes using integer types, each half the size of the previous one.

// Note: the current version of this function is optimized for x86_64.
// If you enable it for another architecture, check the generated code first.
const XMM_BYTES: usize = 128 / 8;
const YMM_BYTES: usize = 256 / 8;
const ZMM_BYTES: usize = 512 / 8;

/// This function autovectorizes successfully if `size_bytes` is divisible
/// by the biggest available SIMD register size. If it is not, it generates
/// SIMD operations for the divisible prefix, then emits plain integer reads and writes.
/// # Safety
/// 1. The pointers must not overlap. 2. They must have at least `size_bytes` valid bytes.
/// 3. `size_bytes` must be exactly divisible by `size_of::<u64>()`.
#[inline]
const unsafe fn swap_simple_u64_chunks(
size_bytes: usize,
x: *mut MaybeUninit<u8>,
y: *mut MaybeUninit<u8>,
) {
let mut byte_offset = 0;
while byte_offset < size_bytes {
// SAFETY:
// The caller must ensure pointer validity and range.
// We use unaligned reads/writes.
unsafe {
let x = x.add(byte_offset).cast();
let y = y.add(byte_offset).cast();
// Same rationale for 2 reads and 2 writes
// as in `swap_simple`.
let tmp_x: MaybeUninit<u64> = ptr::read_unaligned(x);
let tmp_y: MaybeUninit<u64> = ptr::read_unaligned(y);
ptr::write_unaligned(x, tmp_y);
ptr::write_unaligned(y, tmp_x);
}
byte_offset += const { size_of::<u64>() };
}
}

/// This function generates a swap of the next `SIZE_BYTES` bytes using SIMD registers.
/// It has a drawback: it emits the swap using `SIZE_BYTES`-wide accesses only.
/// E.g. if a type has size ZMM_BYTES but its swap is implemented using only `force_swap_simd::<XMM_BYTES>`,
/// it would use only XMM registers.
/// So this function should be used only once, because if it were called twice,
/// it would be better to use a bigger register instead.
///
/// It is OK to call it with YMM_BYTES even if only SSE is enabled because
/// it would just use 4 XMM registers.
/// # Safety
/// 1. The pointers must not overlap. 2. They must have at least `SIZE_BYTES` valid bytes.
#[inline]
const unsafe fn force_swap_simd<const SIZE_BYTES: usize>(
x: *mut MaybeUninit<u8>,
y: *mut MaybeUninit<u8>,
) {
const {
assert!(
SIZE_BYTES == XMM_BYTES || SIZE_BYTES == YMM_BYTES,
"Must have valid SIMD register size",
);
}
// SAFETY: The caller provides valid non-overlapping pointers with at least `SIZE_BYTES` bytes.
// We checked above that `SIZE_BYTES` is a valid SIMD register size.
unsafe {
// Don't use an array for the temporary here because it ends up on the stack.
// E.g. it would copy from memory to a register,
// from the register to the stack, from the stack to a register,
// and from the register to the destination.

const {
assert!(XMM_BYTES == size_of::<u64>() * 2, "Check number of temporaries below.");
assert!(YMM_BYTES == size_of::<u64>() * 4, "Check number of temporaries below.");
}
let x: *mut MaybeUninit<u64> = x.cast();
let y: *mut MaybeUninit<u64> = y.cast();
// Use separate variables instead.
// They are successfully folded into a YMM register when compiled with AVX,
// into a pair of XMM registers in `force_swap_simd::<YMM_BYTES>` without AVX,
// or into a single XMM register in `force_swap_simd::<XMM_BYTES>`.
let x0: MaybeUninit<u64> = ptr::read_unaligned(x);
let x1: MaybeUninit<u64> = ptr::read_unaligned(x.add(1));
let x2: MaybeUninit<u64>;
let x3: MaybeUninit<u64>;
Review comment on lines +890 to +893 (Member):

Does 4xu64 + read_unaligned even make sense? LLVM should lower [u8; 16] loads to movdqu too, which means we don't need to worry about alignment.

Reply from @AngelicosPhosphoros (Contributor, Author), May 14, 2023:

Well, it does make a difference: https://godbolt.org/z/TMcYGzq8z
If I use arrays, it stores one of them on the stack (accessed via the rsp pointer), and copies and reads the data 3 times instead of 2.
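For reference, a standalone sketch of the two 16-byte variants being compared in this thread (illustration only, not part of the diff; the helper names `swap16_via_array` and `swap16_via_scalars` are made up):

use core::ptr;

/// Safety for both helpers: `x` and `y` must be valid for 16 bytes and must not overlap.
// Variant raised by the reviewer: read/write the 16 bytes as one array temporary.
// Per the author's Godbolt comparison above, LLVM may route the array temporary through the stack.
unsafe fn swap16_via_array(x: *mut u8, y: *mut u8) {
    let tmp_x: [u8; 16] = ptr::read_unaligned(x.cast());
    let tmp_y: [u8; 16] = ptr::read_unaligned(y.cast());
    ptr::write_unaligned(x.cast(), tmp_y);
    ptr::write_unaligned(y.cast(), tmp_x);
}

// Variant used by the PR: separate u64 temporaries plus one direct copy,
// which keeps the temporaries in registers.
unsafe fn swap16_via_scalars(x: *mut u8, y: *mut u8) {
    let x0: u64 = ptr::read_unaligned(x.cast());
    let x1: u64 = ptr::read_unaligned(x.add(8).cast());
    ptr::copy_nonoverlapping(y, x, 16); // x now holds the old contents of y
    ptr::write_unaligned(y.cast(), x0);
    ptr::write_unaligned(y.add(8).cast(), x1);
}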

if const { SIZE_BYTES == YMM_BYTES } {
x2 = ptr::read_unaligned(x.add(2));
x3 = ptr::read_unaligned(x.add(3));
} else {
x2 = MaybeUninit::uninit();
x3 = MaybeUninit::uninit();
}

// Unlike the simple swap, we need to use a direct copy here
// instead of a temporary value for `y` as in `swap_simple`,
// because that causes the temporaries for `x` to be copied to the stack.

// Copy as `MaybeUninit<u8>` because `copy_nonoverlapping` requires pointers
// correctly aligned for its element type.
ptr::copy_nonoverlapping::<MaybeUninit<u8>>(y.cast(), x.cast(), SIZE_BYTES);

ptr::write_unaligned(y, x0);
ptr::write_unaligned(y.add(1), x1);
if const { SIZE_BYTES == YMM_BYTES } {
ptr::write_unaligned(y.add(2), x2);
ptr::write_unaligned(y.add(3), x3);
}
}
}

/// Swaps the first `size_of::<ChunkTy>()` bytes of the tail.
/// # Safety
/// `x` and `y` must not overlap.
/// `x` and `y` must have at least `size_of::<ChunkTy>()` valid bytes.
#[inline]
const unsafe fn swap_tail<ChunkTy: Copy>(x: *mut MaybeUninit<u8>, y: *mut MaybeUninit<u8>) {
// SAFETY: The caller must ensure pointer validity.
// We use unaligned reads/writes.
unsafe {
// Same rationale for 2 reads and 2 writes
// as in `swap_simple`.
let tmp_x: MaybeUninit<ChunkTy> = ptr::read_unaligned(x.cast());
let tmp_y: MaybeUninit<ChunkTy> = ptr::read_unaligned(y.cast());
ptr::write_unaligned(x.cast(), tmp_y);
ptr::write_unaligned(y.cast(), tmp_x);
}
}

const {
assert!(size_of::<T>() <= usize::MAX / 4, "We assume that overflows cannot happen.");
}

let x: *mut MaybeUninit<u8> = (x as *mut T).cast();
let y: *mut MaybeUninit<u8> = (y as *mut T).cast();

// It would be nice to add detection of available SIMD features here,
// but since the standard library is distributed precompiled,
// `cfg!(target_feature = "xxx")` evaluates to false here
// even if the final binary is built with those features enabled.

let size = const { size_of::<T>() };
let mut byte_offset = 0;

// This part autovectorizes to use the biggest SIMD register available.
// SAFETY: The pointers are valid because they come from references,
// and `exactly_divisible <= size` because we removed the remainder.
// The whole function contains no places that can panic,
// so it cannot be interrupted before the swapping ends.
unsafe {
let exactly_divisible = const {
let size = size_of::<T>();
size - size % ZMM_BYTES
};
swap_simple_u64_chunks(exactly_divisible, x, y);
byte_offset += exactly_divisible;
}
if byte_offset + YMM_BYTES <= size {
// SAFETY: The pointers don't overlap because mutable references don't overlap.
// We just checked the range.
// The whole function contains no places that can panic,
// so it cannot be interrupted before the swapping ends.
unsafe {
// We need to do this only once because the tail after the ZMM_BYTES chunks cannot contain more than one YMM_BYTES chunk.
// It is OK to do this even if AVX is not enabled because it would just use 4 XMM registers.
force_swap_simd::<YMM_BYTES>(x.add(byte_offset), y.add(byte_offset));
byte_offset += YMM_BYTES;
}
}
if byte_offset + XMM_BYTES <= size {
// SAFETY: The pointers don't overlap because mutable references don't overlap.
// We just checked the range.
// The whole function contains no places that can panic,
// so it cannot be interrupted before the swapping ends.
unsafe {
// We need to do this only once because the tail after the YMM_BYTES chunks cannot contain more than one XMM_BYTES chunk.
force_swap_simd::<XMM_BYTES>(x.add(byte_offset), y.add(byte_offset));
byte_offset += XMM_BYTES;
}
}

macro_rules! swap_tail_by {
($t:ty) => {
// SAFETY: Pointers don't overlap because mutable references don't overlap.
// We never access the pointers in a way that requires alignment,
// and the whole function contains no places that can panic,
// so it is drop safe.
// We have already swapped the first `size_of::<T>() / 16 * 16` bytes.
// We try the `swap_tail` chunk types in order from bigger to smaller.
unsafe {
if byte_offset + const { size_of::<$t>() } <= size {
swap_tail::<$t>(x.add(byte_offset), y.add(byte_offset));
byte_offset += const { size_of::<$t>() };
}
}
};
}
swap_tail_by!(u64);
swap_tail_by!(u32);
swap_tail_by!(u16);
// Swapping by `u8` is guaranteed to finish all bytes
// because it has `size_of == 1`.
swap_tail_by!(u8);
let _ = byte_offset;
}
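As a worked example of the decomposition above (illustration only, not part of the diff, for a hypothetical 70-byte type on a 64-bit target): step 1 swaps bytes 0..64 in `u64` chunks, since 64 = 70 - 70 % ZMM_BYTES; step 2's YMM and XMM swaps are both skipped because the 6-byte tail is smaller than XMM_BYTES; step 3 then finishes with one `u32` swap at offset 64 and one `u16` swap at offset 68, leaving nothing for the final `u8` step.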

// For platforms with slow unaligned accesses
// we can just delegate to the pointer swap.
#[cfg(not(target_arch = "x86_64"))]
#[rustc_const_unstable(feature = "const_swap", issue = "83163")]
#[inline]
pub(crate) const fn swap_chunked<T: Sized>(x: &mut T, y: &mut T) {
// SAFETY: exclusive references always point to one non-overlapping
// element and are non-null and properly aligned.
unsafe {
ptr::swap_nonoverlapping(x, y, 1);
}
}

/// Replaces `dest` with the default value of `T`, returning the previous `dest` value.
///
/// * If you want to replace the values of two variables, see [`swap`].
2 changes: 2 additions & 0 deletions library/core/src/ptr/mod.rs
@@ -942,6 +942,8 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
{
attempt_swap_as_chunks!(usize);
attempt_swap_as_chunks!(u8);
// Should never be reached because anything can be split into `u8` chunks.
unreachable!();
}

// SAFETY: Same preconditions as this function
57 changes: 44 additions & 13 deletions tests/codegen/swap-small-types.rs
@@ -11,10 +11,27 @@ type RGB48 = [u16; 3];
// CHECK-LABEL: @swap_rgb48
#[no_mangle]
pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
// FIXME MIR inlining messes up LLVM optimizations.
// WOULD-CHECK-NOT: alloca
// WOULD-CHECK: load i48
// WOULD-CHECK: store i48
// CHECK-NOT: alloca
// These can be supported only by the backend.
// WOULD-CHECK: load i48
// WOULD-CHECK: store i48
// CHECK: ret void
swap(x, y)
}

// CHECK-LABEL: @swap_vecs
#[no_mangle]
pub fn swap_vecs(x: &mut Vec<u32>, y: &mut Vec<u32>) {
// CHECK-NOT: alloca
// CHECK: ret void
swap(x, y)
}

// CHECK-LABEL: @swap_slices
#[no_mangle]
pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
// CHECK-NOT: alloca
// CHECK: ret void
swap(x, y)
}

@@ -25,9 +42,9 @@ type RGB24 = [u8; 3];
// CHECK-LABEL: @swap_rgb24_slices
#[no_mangle]
pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
// CHECK-NOT: alloca
// CHECK: load <{{[0-9]+}} x i8>
// CHECK: store <{{[0-9]+}} x i8>
if x.len() == y.len() {
x.swap_with_slice(y);
}
@@ -39,9 +56,9 @@ type RGBA32 = [u8; 4];
// CHECK-LABEL: @swap_rgba32_slices
#[no_mangle]
pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
// CHECK-NOT: alloca
// CHECK: load <{{[0-9]+}} x i32>
// CHECK: store <{{[0-9]+}} x i32>
if x.len() == y.len() {
x.swap_with_slice(y);
}
@@ -54,10 +71,24 @@ const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());
// CHECK-LABEL: @swap_string_slices
#[no_mangle]
pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
// CHECK-NOT: alloca
// CHECK: load <{{[0-9]+}} x i64>
// CHECK: store <{{[0-9]+}} x i64>
if x.len() == y.len() {
x.swap_with_slice(y);
}
}

#[repr(C, packed)]
pub struct Packed {
pub first: bool,
pub second: usize,
}

// CHECK-LABEL: @swap_packed_structs
#[no_mangle]
pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {
// CHECK-NOT: alloca
// CHECK: ret void
swap(x, y)
}
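As a quick worked example of why `swap_packed_structs` expects no `alloca` (assuming a 64-bit target where `usize` is 8 bytes): `Packed` is `repr(C, packed)`, so its size is 1 + 8 = 9 bytes with alignment 1. The dispatch condition routes it to `swap_chunked` (9 > 4 * 1), which swaps it with one unaligned `u64` access at offset 0 and one `u8` access at offset 8, with no stack temporary.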