From e49be415cdc24d876403e8493e28ec1bfee62427 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Tue, 16 Apr 2024 20:24:21 +0200 Subject: [PATCH 01/13] Replace sort implementations - `slice::sort` -> driftsort https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md - `slice::sort_unstable` -> ipnsort https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md Replaces the sort implementations with tailor made ones that strike a balance of run-time, compile-time and binary-size, yielding run-time and compile-time improvements. Regressing binary-size for `slice::sort` while improving it for `slice::sort_unstable`. All while upholding the existing soft and hard safety guarantees, and even extending the soft guarantees, detecting strict weak ordering violations with a high chance and reporting it to users via a panic. In addition the implementation of `select_nth_unstable` is also adapted as it uses `slice::sort_unstable` internals. --- library/alloc/src/slice.rs | 209 ++- library/alloc/src/slice/tests.rs | 22 +- library/core/src/slice/mod.rs | 193 ++- library/core/src/slice/sort.rs | 1383 ----------------- library/core/src/slice/sort/mod.rs | 8 + library/core/src/slice/{ => sort}/select.rs | 148 +- library/core/src/slice/sort/shared/mod.rs | 45 + library/core/src/slice/sort/shared/pivot.rs | 88 ++ .../core/src/slice/sort/shared/smallsort.rs | 843 ++++++++++ library/core/src/slice/sort/stable/drift.rs | 300 ++++ library/core/src/slice/sort/stable/merge.rs | 151 ++ library/core/src/slice/sort/stable/mod.rs | 116 ++ .../core/src/slice/sort/stable/quicksort.rs | 267 ++++ .../core/src/slice/sort/unstable/heapsort.rs | 80 + library/core/src/slice/sort/unstable/mod.rs | 76 + .../core/src/slice/sort/unstable/quicksort.rs | 347 +++++ library/core/tests/lib.rs | 1 - library/core/tests/slice.rs | 26 +- 18 files changed, 2618 insertions(+), 1685 deletions(-) delete mode 100644 library/core/src/slice/sort.rs create mode 100644 library/core/src/slice/sort/mod.rs rename library/core/src/slice/{ => sort}/select.rs (77%) create mode 100644 library/core/src/slice/sort/shared/mod.rs create mode 100644 library/core/src/slice/sort/shared/pivot.rs create mode 100644 library/core/src/slice/sort/shared/smallsort.rs create mode 100644 library/core/src/slice/sort/stable/drift.rs create mode 100644 library/core/src/slice/sort/stable/merge.rs create mode 100644 library/core/src/slice/sort/stable/mod.rs create mode 100644 library/core/src/slice/sort/stable/quicksort.rs create mode 100644 library/core/src/slice/sort/unstable/heapsort.rs create mode 100644 library/core/src/slice/sort/unstable/mod.rs create mode 100644 library/core/src/slice/sort/unstable/quicksort.rs diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs index ebe6f7e7caa9b..104f4ee753b16 100644 --- a/library/alloc/src/slice.rs +++ b/library/alloc/src/slice.rs @@ -16,7 +16,7 @@ use core::borrow::{Borrow, BorrowMut}; #[cfg(not(no_global_oom_handling))] use core::cmp::Ordering::{self, Less}; #[cfg(not(no_global_oom_handling))] -use core::mem::{self, SizedTypeProperties}; +use core::mem::{self, MaybeUninit}; #[cfg(not(no_global_oom_handling))] use core::ptr; #[cfg(not(no_global_oom_handling))] @@ -24,7 +24,7 @@ use core::slice::sort; use crate::alloc::Allocator; #[cfg(not(no_global_oom_handling))] -use crate::alloc::{self, Global}; +use crate::alloc::Global; #[cfg(not(no_global_oom_handling))] use crate::borrow::ToOwned; use crate::boxed::Box; @@ -174,23 +174,32 @@ 
pub(crate) mod hack { #[cfg(not(test))] impl [T] { - /// Sorts the slice. + /// Sorts the slice, preserving initial order of equal elements. /// - /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case. + /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) + /// worst-case. + /// + /// If `T: Ord` does not implement a total order the resulting order is unspecified. All + /// original elements will remain in the slice and any possible modifications via interior + /// mutability are observed in the input. Same is true if `T: Ord` panics. /// /// When applicable, unstable sorting is preferred because it is generally faster than stable - /// sorting and it doesn't allocate auxiliary memory. - /// See [`sort_unstable`](slice::sort_unstable). + /// sorting and it doesn't allocate auxiliary memory. See + /// [`sort_unstable`](slice::sort_unstable). The exception are partially sorted slices, which + /// may be better served with `slice::sort`. /// /// # Current implementation /// - /// The current algorithm is an adaptive, iterative merge sort inspired by - /// [timsort](https://en.wikipedia.org/wiki/Timsort). - /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of - /// two or more sorted sequences concatenated one after another. + /// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which + /// combines the fast average case of quicksort with the fast worst case and partial run + /// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs + /// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*. + /// + /// The auxiliary memory allocation behavior depends on the input length. Short slices are + /// handled without allocation, medium sized slices allocate `self.len()` and beyond that it + /// clamps at `self.len() / 2`. /// - /// Also, it allocates temporary storage half the size of `self`, but for short slices a - /// non-allocating insertion sort is used instead. + /// If `T: Ord` does not implement a total order, the implementation may panic. /// /// # Examples /// @@ -200,6 +209,8 @@ impl [T] { /// v.sort(); /// assert!(v == [-5, -3, 1, 2, 4]); /// ``` + /// + /// [driftsort]: https://github.com/Voultapher/driftsort #[cfg(not(no_global_oom_handling))] #[rustc_allow_incoherent_impl] #[stable(feature = "rust1", since = "1.0.0")] @@ -211,13 +222,18 @@ impl [T] { stable_sort(self, T::lt); } - /// Sorts the slice with a comparator function. + /// Sorts the slice with a comparator function, preserving initial order of equal elements. + /// + /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) + /// worst-case. /// - /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case. + /// The comparator function should define a total ordering for the elements in the slice. If the + /// ordering is not total, the order of the elements is unspecified. /// - /// The comparator function must define a total ordering for the elements in the slice. If - /// the ordering is not total, the order of the elements is unspecified. An order is a - /// total order if it is (for all `a`, `b` and `c`): + /// If the comparator function does not implement a total order the resulting order is + /// unspecified. 
All original elements will remain in the slice and any possible modifications + /// via interior mutability are observed in the input. Same is true if the comparator function + /// panics. A total order (for all `a`, `b` and `c`): /// /// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and /// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`. @@ -227,23 +243,22 @@ impl [T] { /// /// ``` /// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0]; - /// floats.sort_by(|a, b| a.partial_cmp(b).unwrap()); + /// floats.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); /// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]); /// ``` /// - /// When applicable, unstable sorting is preferred because it is generally faster than stable - /// sorting and it doesn't allocate auxiliary memory. - /// See [`sort_unstable_by`](slice::sort_unstable_by). - /// /// # Current implementation /// - /// The current algorithm is an adaptive, iterative merge sort inspired by - /// [timsort](https://en.wikipedia.org/wiki/Timsort). - /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of - /// two or more sorted sequences concatenated one after another. + /// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which + /// combines the fast average case of quicksort with the fast worst case and partial run + /// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs + /// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*. + /// + /// The auxiliary memory allocation behavior depends on the input length. Short slices are + /// handled without allocation, medium sized slices allocate `self.len()` and beyond that it + /// clamps at `self.len() / 2`. /// - /// Also, it allocates temporary storage half the size of `self`, but for short slices a - /// non-allocating insertion sort is used instead. + /// If `T: Ord` does not implement a total order, the implementation may panic. /// /// # Examples /// @@ -256,6 +271,8 @@ impl [T] { /// v.sort_by(|a, b| b.cmp(a)); /// assert!(v == [5, 4, 3, 2, 1]); /// ``` + /// + /// [driftsort]: https://github.com/Voultapher/driftsort #[cfg(not(no_global_oom_handling))] #[rustc_allow_incoherent_impl] #[stable(feature = "rust1", since = "1.0.0")] @@ -267,28 +284,27 @@ impl [T] { stable_sort(self, |a, b| compare(a, b) == Less); } - /// Sorts the slice with a key extraction function. + /// Sorts the slice with a key extraction function, preserving initial order of equal elements. /// /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* \* log(*n*)) /// worst-case, where the key function is *O*(*m*). /// - /// For expensive key functions (e.g. functions that are not simple property accesses or - /// basic operations), [`sort_by_cached_key`](slice::sort_by_cached_key) is likely to be - /// significantly faster, as it does not recompute element keys. - /// - /// When applicable, unstable sorting is preferred because it is generally faster than stable - /// sorting and it doesn't allocate auxiliary memory. - /// See [`sort_unstable_by_key`](slice::sort_unstable_by_key). + /// If `K: Ord` does not implement a total order the resulting order is unspecified. + /// All original elements will remain in the slice and any possible modifications via interior + /// mutability are observed in the input. Same is true if `K: Ord` panics. 
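To make the guarantee described above concrete, here is a small illustrative program (an editorial sketch in the spirit of the `test_sort` change later in this patch, not code taken from it): if the key function panics partway through, the sort is abandoned, but every original element is still present in the slice afterwards.

```
use std::panic::{self, AssertUnwindSafe};

fn main() {
    // Reversed input and a key function that fails partway through the sort.
    let mut v: Vec<i32> = (0..100).rev().collect();
    let mut calls = 0;

    let result = panic::catch_unwind(AssertUnwindSafe(|| {
        v.sort_by_key(|x| {
            calls += 1;
            if calls == 50 {
                panic!("key extraction failed");
            }
            *x
        });
    }));
    assert!(result.is_err());

    // The documented guarantee: the resulting order is unspecified, but all
    // original elements remain in the slice.
    v.sort();
    assert_eq!(v, (0..100).collect::<Vec<i32>>());
}
```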
/// /// # Current implementation /// - /// The current algorithm is an adaptive, iterative merge sort inspired by - /// [timsort](https://en.wikipedia.org/wiki/Timsort). - /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of - /// two or more sorted sequences concatenated one after another. + /// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which + /// combines the fast average case of quicksort with the fast worst case and partial run + /// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs + /// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*. + /// + /// The auxiliary memory allocation behavior depends on the input length. Short slices are + /// handled without allocation, medium sized slices allocate `self.len()` and beyond that it + /// clamps at `self.len() / 2`. /// - /// Also, it allocates temporary storage half the size of `self`, but for short slices a - /// non-allocating insertion sort is used instead. + /// If `K: Ord` does not implement a total order, the implementation may panic. /// /// # Examples /// @@ -298,6 +314,8 @@ impl [T] { /// v.sort_by_key(|k| k.abs()); /// assert!(v == [1, 2, -3, 4, -5]); /// ``` + /// + /// [driftsort]: https://github.com/Voultapher/driftsort #[cfg(not(no_global_oom_handling))] #[rustc_allow_incoherent_impl] #[stable(feature = "slice_sort_by_key", since = "1.7.0")] @@ -310,27 +328,30 @@ impl [T] { stable_sort(self, |a, b| f(a).lt(&f(b))); } - /// Sorts the slice with a key extraction function. + /// Sorts the slice with a key extraction function, preserving initial order of equal elements. /// - /// During sorting, the key function is called at most once per element, by using - /// temporary storage to remember the results of key evaluation. - /// The order of calls to the key function is unspecified and may change in future versions - /// of the standard library. + /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \* + /// log(*n*)) worst-case, where the key function is *O*(*m*). /// - /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \* log(*n*)) - /// worst-case, where the key function is *O*(*m*). + /// During sorting, the key function is called at most once per element, by using temporary + /// storage to remember the results of key evaluation. The order of calls to the key function is + /// unspecified and may change in future versions of the standard library. + /// + /// If `K: Ord` does not implement a total order the resulting order is unspecified. + /// All original elements will remain in the slice and any possible modifications via interior + /// mutability are observed in the input. Same is true if `K: Ord` panics. /// - /// For simple key functions (e.g., functions that are property accesses or - /// basic operations), [`sort_by_key`](slice::sort_by_key) is likely to be - /// faster. + /// For simple key functions (e.g., functions that are property accesses or basic operations), + /// [`sort_by_key`](slice::sort_by_key) is likely to be faster. /// /// # Current implementation /// - /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters, - /// which combines the fast average case of randomized quicksort with the fast worst case of - /// heapsort, while achieving linear time on slices with certain patterns. 
It uses some - /// randomization to avoid degenerate cases, but with a fixed seed to always provide - /// deterministic behavior. + /// The current implementation is based on [instruction-parallel-network sort][ipnsort] by Lukas + /// Bergdoll, which combines the fast average case of randomized quicksort with the fast worst + /// case of heapsort, while achieving linear time on fully sorted and reversed inputs. And + /// *O*(*k* \* log(*n*)) where *k* is the number of distinct elements in the input. It leverages + /// superscalar out-of-order execution capabilities commonly found in CPUs, to efficiently + /// perform the operation. /// /// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the /// length of the slice. @@ -344,7 +365,7 @@ impl [T] { /// assert!(v == [-3, -5, 2, 32, 4]); /// ``` /// - /// [pdqsort]: https://github.com/orlp/pdqsort + /// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort #[cfg(not(no_global_oom_handling))] #[rustc_allow_incoherent_impl] #[stable(feature = "slice_sort_by_cached_key", since = "1.34.0")] @@ -361,7 +382,7 @@ impl [T] { $slice.iter().map($f).enumerate().map(|(i, k)| (k, i as $t)).collect(); // The elements of `indices` are unique, as they are indexed, so any sort will be // stable with respect to the original slice. We use `sort_unstable` here because - // it requires less memory allocation. + // it requires no memory allocation. indices.sort_unstable(); for i in 0..$slice.len() { let mut index = indices[i].1; @@ -374,24 +395,24 @@ impl [T] { }}; } - let sz_u8 = mem::size_of::<(K, u8)>(); - let sz_u16 = mem::size_of::<(K, u16)>(); - let sz_u32 = mem::size_of::<(K, u32)>(); - let sz_usize = mem::size_of::<(K, usize)>(); - let len = self.len(); if len < 2 { return; } - if sz_u8 < sz_u16 && len <= (u8::MAX as usize) { - return sort_by_key!(u8, self, f); - } - if sz_u16 < sz_u32 && len <= (u16::MAX as usize) { - return sort_by_key!(u16, self, f); - } - if sz_u32 < sz_usize && len <= (u32::MAX as usize) { + + // Avoids binary-size usage in cases where the alignment doesn't work out to make this + // beneficial or on 32-bit platforms. + let is_using_u32_as_idx_type_helpful = + const { mem::size_of::<(K, u32)>() < mem::size_of::<(K, usize)>() }; + + // It's possible to instantiate this for u8 and u16 but, doing so is very wasteful in terms + // of compile-times and binary-size, the peak saved heap memory for u16 is (u8 + u16) -> 4 + // bytes * u16::MAX vs (u8 + u32) -> 8 bytes * u16::MAX, the saved heap memory is at peak + // ~262KB. + if is_using_u32_as_idx_type_helpful && len <= (u32::MAX as usize) { return sort_by_key!(u32, self, f); } + sort_by_key!(usize, self, f) } @@ -843,46 +864,18 @@ fn stable_sort(v: &mut [T], mut is_less: F) where F: FnMut(&T, &T) -> bool, { - if T::IS_ZST { - // Sorting has no meaningful behavior on zero-sized types. Do nothing. - return; - } + use sort::stable::BufGuard; - let elem_alloc_fn = |len: usize| -> *mut T { - // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len > - // v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap - // elements. - unsafe { alloc::alloc(alloc::Layout::array::(len).unwrap_unchecked()) as *mut T } - }; - - let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| { - // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len > - // v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same - // len. 
- unsafe { - alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::<T>(len).unwrap_unchecked()); - } - }; - - let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun { - // SAFETY: Creating the layout is safe as long as merge_sort never calls this with an - // obscene length or 0. - unsafe { - alloc::alloc(alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked()) - as *mut sort::TimSortRun + #[unstable(issue = "none", feature = "std_internals")] + impl<T> BufGuard<T> for Vec<T> { + fn with_capacity(capacity: usize) -> Self { + Vec::with_capacity(capacity) } - }; - let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| { - // SAFETY: The caller must ensure that buf_ptr was created by elem_alloc_fn with the same - // len. - unsafe { - alloc::dealloc( - buf_ptr as *mut u8, - alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked(), - ); + fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] { + self.spare_capacity_mut() } - }; + } - sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn); + sort::stable::sort::<T, F, Vec<T>>(v, &mut is_less); } diff --git a/library/alloc/src/slice/tests.rs b/library/alloc/src/slice/tests.rs index 54bc4e77b16f0..63a176a5ace38 100644 --- a/library/alloc/src/slice/tests.rs +++ b/library/alloc/src/slice/tests.rs @@ -34,7 +34,7 @@ macro_rules! do_test { } let v = $input.to_owned(); - let _ = std::panic::catch_unwind(move || { + let _ = panic::catch_unwind(move || { let mut v = v; let mut panic_countdown = panic_countdown; v.$func(|a, b| { @@ -197,8 +197,7 @@ fn panic_safe() { let mut rng = test_rng(); - // Miri is too slow (but still need to `chain` to make the types match) - let lens = if cfg!(miri) { (1..10).chain(0..0) } else { (1..20).chain(70..MAX_LEN) }; + let lens = if cfg!(miri) { (1..10).chain(30..36) } else { (1..20).chain(70..MAX_LEN) }; let moduli: &[u32] = if cfg!(miri) { &[5] } else { &[5, 20, 50] }; for len in lens { @@ -294,15 +293,20 @@ fn test_sort() { } } - // Sort using a completely random comparison function. - // This will reorder the elements *somehow*, but won't panic. - let mut v = [0; 500]; - for i in 0..v.len() { + const ORD_VIOLATION_MAX_LEN: usize = 500; + let mut v = [0; ORD_VIOLATION_MAX_LEN]; + for i in 0..ORD_VIOLATION_MAX_LEN { v[i] = i as i32; } - v.sort_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap()); + + // Sort using a completely random comparison function. This will reorder the elements *somehow*, + // it may panic but the original elements must still be present. + let _ = panic::catch_unwind(move || { + v.sort_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap()); + }); + v.sort(); - for i in 0..v.len() { + for i in 0..ORD_VIOLATION_MAX_LEN { assert_eq!(v[i], i as i32); } diff --git a/library/core/src/slice/mod.rs b/library/core/src/slice/mod.rs index 9bee50424b3d6..4e6070cdb1b57 100644 --- a/library/core/src/slice/mod.rs +++ b/library/core/src/slice/mod.rs @@ -39,7 +39,6 @@ pub(crate) mod index; mod iter; mod raw; mod rotate; -mod select; mod specialize; #[unstable(feature = "str_internals", issue = "none")] @@ -83,10 +82,6 @@ pub use raw::{from_mut, from_ref}; #[unstable(feature = "slice_from_ptr_range", issue = "89792")] pub use raw::{from_mut_ptr_range, from_ptr_range}; -// This function is public only because there is no other way to unit test heapsort.
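The new `stable_sort` shown above replaces the old closure-based allocation plumbing with a buffer-provider trait: the sort logic lives in `core`, which cannot allocate, so `alloc` implements `sort::stable::BufGuard` for `Vec<T>` and hands it to `sort::stable::sort`. The standalone sketch below models that split; `ScratchBuf` and `with_scratch` are illustrative stand-ins, and only the two method signatures are taken from the hunk.

```
use std::mem::MaybeUninit;

// Stand-in for the buffer-provider trait used by the new stable sort.
trait ScratchBuf<T> {
    fn with_capacity(capacity: usize) -> Self;
    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>];
}

impl<T> ScratchBuf<T> for Vec<T> {
    fn with_capacity(capacity: usize) -> Self {
        Vec::with_capacity(capacity)
    }

    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        // A Vec's spare capacity is exactly a `&mut [MaybeUninit<T>]`.
        self.spare_capacity_mut()
    }
}

// A consumer that is generic over who provides the scratch memory, in the
// spirit of the `sort::stable::sort::<T, F, Vec<T>>` call above.
fn with_scratch<T, B: ScratchBuf<T>>(len: usize, body: impl FnOnce(&mut [MaybeUninit<T>])) {
    let mut buf = B::with_capacity(len);
    body(&mut buf.as_uninit_slice_mut()[..len]);
}

fn main() {
    with_scratch::<u64, Vec<u64>>(8, |scratch| {
        assert_eq!(scratch.len(), 8);
    });
}
```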
-#[unstable(feature = "sort_internals", reason = "internal to sort module", issue = "none")] -pub use sort::heapsort; - #[stable(feature = "slice_get_slice", since = "1.28.0")] pub use index::SliceIndex; @@ -2884,21 +2879,26 @@ impl [T] { self.binary_search_by(|k| f(k).cmp(b)) } - /// Sorts the slice, but might not preserve the order of equal elements. + /// Sorts the slice **without** preserving the initial order of equal elements. + /// + /// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not + /// allocate), and *O*(*n* \* log(*n*)) worst-case. /// - /// This sort is unstable (i.e., may reorder equal elements), in-place - /// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case. + /// If `T: Ord` does not implement a total order the resulting order is unspecified. All + /// original elements will remain in the slice and any possible modifications via interior + /// mutability are observed in the input. Same is true if `T: Ord` panics. /// /// # Current implementation /// - /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters, - /// which combines the fast average case of randomized quicksort with the fast worst case of - /// heapsort, while achieving linear time on slices with certain patterns. It uses some - /// randomization to avoid degenerate cases, but with a fixed seed to always provide - /// deterministic behavior. + /// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which + /// combines the fast average case of quicksort with the fast worst case of heapsort, achieving + /// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the + /// expected time to sort the data is *O(*n* log(*k*))*. /// /// It is typically faster than stable sorting, except in a few special cases, e.g., when the - /// slice consists of several concatenated sorted sequences. + /// slice is partially sorted. + /// + /// If `T: Ord` does not implement a total order, the implementation may panic. /// /// # Examples /// @@ -2909,25 +2909,29 @@ impl [T] { /// assert!(v == [-5, -3, 1, 2, 4]); /// ``` /// - /// [pdqsort]: https://github.com/orlp/pdqsort + /// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort #[stable(feature = "sort_unstable", since = "1.20.0")] #[inline] pub fn sort_unstable(&mut self) where T: Ord, { - sort::quicksort(self, T::lt); + sort::unstable::sort(self, &mut T::lt); } - /// Sorts the slice with a comparator function, but might not preserve the order of equal - /// elements. + /// Sorts the slice with a comparator function, **without** preserving the initial order of + /// equal elements. /// - /// This sort is unstable (i.e., may reorder equal elements), in-place - /// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case. + /// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not + /// allocate), and *O*(*n* \* log(*n*)) worst-case. /// - /// The comparator function must define a total ordering for the elements in the slice. If - /// the ordering is not total, the order of the elements is unspecified. An order is a - /// total order if it is (for all `a`, `b` and `c`): + /// The comparator function should define a total ordering for the elements in the slice. If the + /// ordering is not total, the order of the elements is unspecified. + /// + /// If the comparator function does not implement a total order the resulting order is + /// unspecified. 
All original elements will remain in the slice and any possible modifications + /// via interior mutability are observed in the input. Same is true if the comparator function + /// panics. A total order (for all `a`, `b` and `c`): /// /// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and /// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`. @@ -2943,14 +2947,15 @@ impl [T] { /// /// # Current implementation /// - /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters, - /// which combines the fast average case of randomized quicksort with the fast worst case of - /// heapsort, while achieving linear time on slices with certain patterns. It uses some - /// randomization to avoid degenerate cases, but with a fixed seed to always provide - /// deterministic behavior. + /// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which + /// combines the fast average case of quicksort with the fast worst case of heapsort, achieving + /// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the + /// expected time to sort the data is *O(*n* log(*k*))*. /// /// It is typically faster than stable sorting, except in a few special cases, e.g., when the - /// slice consists of several concatenated sorted sequences. + /// slice is partially sorted. + /// + /// If `T: Ord` does not implement a total order, the implementation may panic. /// /// # Examples /// @@ -2964,34 +2969,37 @@ impl [T] { /// assert!(v == [5, 4, 3, 2, 1]); /// ``` /// - /// [pdqsort]: https://github.com/orlp/pdqsort + /// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort #[stable(feature = "sort_unstable", since = "1.20.0")] #[inline] pub fn sort_unstable_by(&mut self, mut compare: F) where F: FnMut(&T, &T) -> Ordering, { - sort::quicksort(self, |a, b| compare(a, b) == Ordering::Less); + sort::unstable::sort(self, &mut |a, b| compare(a, b) == Ordering::Less); } - /// Sorts the slice with a key extraction function, but might not preserve the order of equal - /// elements. + /// Sorts the slice with a key extraction function, **without** preserving the initial order of + /// equal elements. /// - /// This sort is unstable (i.e., may reorder equal elements), in-place - /// (i.e., does not allocate), and *O*(*m* \* *n* \* log(*n*)) worst-case, where the key function is - /// *O*(*m*). + /// This sort is unstable (i.e., may reorder equal elements), in-place (i.e., does not + /// allocate), and *O*(*n* \* log(*n*)) worst-case. + /// + /// If `K: Ord` does not implement a total order the resulting order is unspecified. + /// All original elements will remain in the slice and any possible modifications via interior + /// mutability are observed in the input. Same is true if `K: Ord` panics. /// /// # Current implementation /// - /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters, - /// which combines the fast average case of randomized quicksort with the fast worst case of - /// heapsort, while achieving linear time on slices with certain patterns. It uses some - /// randomization to avoid degenerate cases, but with a fixed seed to always provide - /// deterministic behavior. 
+ /// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which + /// combines the fast average case of quicksort with the fast worst case of heapsort, achieving + /// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the + /// expected time to sort the data is *O(*n* log(*k*))*. + /// + /// It is typically faster than stable sorting, except in a few special cases, e.g., when the + /// slice is partially sorted. /// - /// Due to its key calling strategy, [`sort_unstable_by_key`](#method.sort_unstable_by_key) - /// is likely to be slower than [`sort_by_cached_key`](#method.sort_by_cached_key) in - /// cases where the key function is expensive. + /// If `K: Ord` does not implement a total order, the implementation may panic. /// /// # Examples /// @@ -3002,7 +3010,7 @@ impl [T] { /// assert!(v == [1, 2, -3, 4, -5]); /// ``` /// - /// [pdqsort]: https://github.com/orlp/pdqsort + /// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort #[stable(feature = "sort_unstable", since = "1.20.0")] #[inline] pub fn sort_unstable_by_key(&mut self, mut f: F) @@ -3010,27 +3018,32 @@ impl [T] { F: FnMut(&T) -> K, K: Ord, { - sort::quicksort(self, |a, b| f(a).lt(&f(b))); + sort::unstable::sort(self, &mut |a, b| f(a).lt(&f(b))); } - /// Reorder the slice such that the element at `index` after the reordering is at its final sorted position. + /// Reorder the slice such that the element at `index` after the reordering is at its final + /// sorted position. /// /// This reordering has the additional property that any value at position `i < index` will be /// less than or equal to any value at a position `j > index`. Additionally, this reordering is - /// unstable (i.e. any number of equal elements may end up at position `index`), in-place - /// (i.e. does not allocate), and runs in *O*(*n*) time. - /// This function is also known as "kth element" in other libraries. + /// unstable (i.e. any number of equal elements may end up at position `index`), in-place (i.e. + /// does not allocate), and runs in *O*(*n*) time. This function is also known as "kth element" + /// in other libraries. /// - /// It returns a triplet of the following from the reordered slice: - /// the subslice prior to `index`, the element at `index`, and the subslice after `index`; - /// accordingly, the values in those two subslices will respectively all be less-than-or-equal-to - /// and greater-than-or-equal-to the value of the element at `index`. + /// It returns a triplet of the following from the reordered slice: the subslice prior to + /// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in + /// those two subslices will respectively all be less-than-or-equal-to and + /// greater-than-or-equal-to the value of the element at `index`. /// /// # Current implementation /// - /// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also - /// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for - /// pivot selection, which guarantees linear runtime for all inputs. + /// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll + /// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is + /// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime + /// for all inputs. 
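The introselect scheme described above can be illustrated with a plain quickselect skeleton. This is an editorial sketch rather than the code added by the patch: it uses a naive middle-element pivot and simple recursion, whereas ipnsort chooses pivots more carefully and falls back to median-of-medians so the worst case stays linear.

```
// Partition around a pivot, then recurse only into the side containing `index`.
fn select_nth<T: Ord>(v: &mut [T], index: usize) {
    if v.len() <= 1 {
        return;
    }

    // Lomuto partition around a pivot moved to the last position.
    let last = v.len() - 1;
    v.swap(v.len() / 2, last);
    let mut store = 0;
    for i in 0..last {
        if v[i] < v[last] {
            v.swap(i, store);
            store += 1;
        }
    }
    v.swap(store, last); // the pivot now sits at its final sorted position

    if index < store {
        select_nth(&mut v[..store], index);
    } else if index > store {
        select_nth(&mut v[store + 1..], index - store - 1);
    }
}

fn main() {
    let mut v = [9, 1, 8, 2, 7, 3, 6, 4, 5];
    select_nth(&mut v, 4);
    assert_eq!(v[4], 5); // the median is in its final sorted position
    assert!(v[..4].iter().all(|&x| x <= 5));
    assert!(v[5..].iter().all(|&x| x >= 5));
}
```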
+ /// + /// It is typically faster than sorting, except in a few special cases, e.g., when the slice is + /// nearly fully sorted, where [`slice::sort`] may be faster. /// /// [`sort_unstable`]: slice::sort_unstable /// @@ -3058,35 +3071,40 @@ impl [T] { /// v == [-3, -5, 1, 4, 2] || /// v == [-5, -3, 1, 4, 2]); /// ``` + /// + /// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort #[stable(feature = "slice_select_nth_unstable", since = "1.49.0")] #[inline] pub fn select_nth_unstable(&mut self, index: usize) -> (&mut [T], &mut T, &mut [T]) where T: Ord, { - select::partition_at_index(self, index, T::lt) + sort::select::partition_at_index(self, index, T::lt) } - /// Reorder the slice with a comparator function such that the element at `index` after the reordering is at - /// its final sorted position. + /// Reorder the slice with a comparator function such that the element at `index` after the + /// reordering is at its final sorted position. /// /// This reordering has the additional property that any value at position `i < index` will be /// less than or equal to any value at a position `j > index` using the comparator function. /// Additionally, this reordering is unstable (i.e. any number of equal elements may end up at - /// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. - /// This function is also known as "kth element" in other libraries. + /// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. This + /// function is also known as "kth element" in other libraries. /// - /// It returns a triplet of the following from - /// the slice reordered according to the provided comparator function: the subslice prior to - /// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in - /// those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to - /// the value of the element at `index`. + /// It returns a triplet of the following from the slice reordered according to the provided + /// comparator function: the subslice prior to `index`, the element at `index`, and the subslice + /// after `index`; accordingly, the values in those two subslices will respectively all be + /// less-than-or-equal-to and greater-than-or-equal-to the value of the element at `index`. /// /// # Current implementation /// - /// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also - /// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for - /// pivot selection, which guarantees linear runtime for all inputs. + /// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll + /// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is + /// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime + /// for all inputs. + /// + /// It is typically faster than sorting, except in a few special cases, e.g., when the slice is + /// nearly fully sorted, where [`slice::sort`] may be faster. 
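As a usage note for the return value described above (an editorial example, not part of the patch): combining the returned triplet with a reversed comparator yields an *O*(*n*) top-k query.

```
fn main() {
    let mut scores = [31, 87, 12, 99, 4, 55, 70, 23];

    // Sort descending around index 2: everything before it is >= the element
    // at index 2, which is the third-largest value.
    let (top_two, third, _rest) = scores.select_nth_unstable_by(2, |a, b| b.cmp(a));

    let mut top3: Vec<i32> = top_two.iter().copied().chain([*third]).collect();
    top3.sort_unstable();
    assert_eq!(top3, [70, 87, 99]);
}
```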
/// /// [`sort_unstable`]: slice::sort_unstable /// @@ -3114,6 +3132,8 @@ impl [T] { /// v == [4, 2, 1, -5, -3] || /// v == [4, 2, 1, -3, -5]); /// ``` + /// + /// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort #[stable(feature = "slice_select_nth_unstable", since = "1.49.0")] #[inline] pub fn select_nth_unstable_by( @@ -3124,29 +3144,32 @@ impl [T] { where F: FnMut(&T, &T) -> Ordering, { - select::partition_at_index(self, index, |a: &T, b: &T| compare(a, b) == Less) + sort::select::partition_at_index(self, index, |a: &T, b: &T| compare(a, b) == Less) } - /// Reorder the slice with a key extraction function such that the element at `index` after the reordering is - /// at its final sorted position. + /// Reorder the slice with a key extraction function such that the element at `index` after the + /// reordering is at its final sorted position. /// /// This reordering has the additional property that any value at position `i < index` will be /// less than or equal to any value at a position `j > index` using the key extraction function. /// Additionally, this reordering is unstable (i.e. any number of equal elements may end up at - /// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. - /// This function is also known as "kth element" in other libraries. + /// position `index`), in-place (i.e. does not allocate), and runs in *O*(*n*) time. This + /// function is also known as "kth element" in other libraries. /// - /// It returns a triplet of the following from - /// the slice reordered according to the provided key extraction function: the subslice prior to - /// `index`, the element at `index`, and the subslice after `index`; accordingly, the values in - /// those two subslices will respectively all be less-than-or-equal-to and greater-than-or-equal-to - /// the value of the element at `index`. + /// It returns a triplet of the following from the slice reordered according to the provided key + /// extraction function: the subslice prior to `index`, the element at `index`, and the subslice + /// after `index`; accordingly, the values in those two subslices will respectively all be + /// less-than-or-equal-to and greater-than-or-equal-to the value of the element at `index`. /// /// # Current implementation /// - /// The current algorithm is an introselect implementation based on Pattern Defeating Quicksort, which is also - /// the basis for [`sort_unstable`]. The fallback algorithm is Median of Medians using Tukey's Ninther for - /// pivot selection, which guarantees linear runtime for all inputs. + /// The current algorithm is an introselect implementation based on [ipnsort] by Lukas Bergdoll + /// and Orson Peters, which is also the basis for [`sort_unstable`]. The fallback algorithm is + /// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime + /// for all inputs. + /// + /// It is typically faster than sorting, except in a few special cases, e.g., when the slice is + /// nearly fully sorted, where [`slice::sort`] may be faster. 
/// /// [`sort_unstable`]: slice::sort_unstable /// @@ -3174,6 +3197,8 @@ impl [T] { /// v == [2, 1, -3, 4, -5] || /// v == [2, 1, -3, -5, 4]); /// ``` + /// + /// [ipnsort]: https://github.com/Voultapher/sort-research-rs/tree/main/ipnsort #[stable(feature = "slice_select_nth_unstable", since = "1.49.0")] #[inline] pub fn select_nth_unstable_by_key( @@ -3185,7 +3210,7 @@ impl [T] { F: FnMut(&T) -> K, K: Ord, { - select::partition_at_index(self, index, |a: &T, b: &T| f(a).lt(&f(b))) + sort::select::partition_at_index(self, index, |a: &T, b: &T| f(a).lt(&f(b))) } /// Moves all consecutive repeated elements to the end of the slice according to the diff --git a/library/core/src/slice/sort.rs b/library/core/src/slice/sort.rs deleted file mode 100644 index 993a608f42b60..0000000000000 --- a/library/core/src/slice/sort.rs +++ /dev/null @@ -1,1383 +0,0 @@ -//! Slice sorting -//! -//! This module contains a sorting algorithm based on Orson Peters' pattern-defeating quicksort, -//! published at: -//! -//! Unstable sorting is compatible with core because it doesn't allocate memory, unlike our -//! stable sorting implementation. -//! -//! In addition it also contains the core logic of the stable sort used by `slice::sort` based on -//! TimSort. - -use crate::cmp; -use crate::mem::{self, MaybeUninit, SizedTypeProperties}; -use crate::ptr; - -// When dropped, copies from `src` into `dest`. -struct InsertionHole { - src: *const T, - dest: *mut T, -} - -impl Drop for InsertionHole { - fn drop(&mut self) { - // SAFETY: This is a helper class. Please refer to its usage for correctness. Namely, one - // must be sure that `src` and `dst` does not overlap as required by - // `ptr::copy_nonoverlapping` and are both valid for writes. - unsafe { - ptr::copy_nonoverlapping(self.src, self.dest, 1); - } - } -} - -/// Inserts `v[v.len() - 1]` into pre-sorted sequence `v[..v.len() - 1]` so that whole `v[..]` -/// becomes sorted. -unsafe fn insert_tail(v: &mut [T], is_less: &mut F) -where - F: FnMut(&T, &T) -> bool, -{ - debug_assert!(v.len() >= 2); - - let arr_ptr = v.as_mut_ptr(); - let i = v.len() - 1; - - // SAFETY: caller must ensure v is at least len 2. - unsafe { - // See insert_head which talks about why this approach is beneficial. - let i_ptr = arr_ptr.add(i); - - // It's important that we use i_ptr here. If this check is positive and we continue, - // We want to make sure that no other copy of the value was seen by is_less. - // Otherwise we would have to copy it back. - if is_less(&*i_ptr, &*i_ptr.sub(1)) { - // It's important, that we use tmp for comparison from now on. As it is the value that - // will be copied back. And notionally we could have created a divergence if we copy - // back the wrong value. - let tmp = mem::ManuallyDrop::new(ptr::read(i_ptr)); - // Intermediate state of the insertion process is always tracked by `hole`, which - // serves two purposes: - // 1. Protects integrity of `v` from panics in `is_less`. - // 2. Fills the remaining hole in `v` in the end. - // - // Panic safety: - // - // If `is_less` panics at any point during the process, `hole` will get dropped and - // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it - // initially held exactly once. - let mut hole = InsertionHole { src: &*tmp, dest: i_ptr.sub(1) }; - ptr::copy_nonoverlapping(hole.dest, i_ptr, 1); - - // SAFETY: We know i is at least 1. 
- for j in (0..(i - 1)).rev() { - let j_ptr = arr_ptr.add(j); - if !is_less(&*tmp, &*j_ptr) { - break; - } - - ptr::copy_nonoverlapping(j_ptr, hole.dest, 1); - hole.dest = j_ptr; - } - // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`. - } - } -} - -/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted. -/// -/// This is the integral subroutine of insertion sort. -unsafe fn insert_head(v: &mut [T], is_less: &mut F) -where - F: FnMut(&T, &T) -> bool, -{ - debug_assert!(v.len() >= 2); - - // SAFETY: caller must ensure v is at least len 2. - unsafe { - if is_less(v.get_unchecked(1), v.get_unchecked(0)) { - let arr_ptr = v.as_mut_ptr(); - - // There are three ways to implement insertion here: - // - // 1. Swap adjacent elements until the first one gets to its final destination. - // However, this way we copy data around more than is necessary. If elements are big - // structures (costly to copy), this method will be slow. - // - // 2. Iterate until the right place for the first element is found. Then shift the - // elements succeeding it to make room for it and finally place it into the - // remaining hole. This is a good method. - // - // 3. Copy the first element into a temporary variable. Iterate until the right place - // for it is found. As we go along, copy every traversed element into the slot - // preceding it. Finally, copy data from the temporary variable into the remaining - // hole. This method is very good. Benchmarks demonstrated slightly better - // performance than with the 2nd method. - // - // All methods were benchmarked, and the 3rd showed best results. So we chose that one. - let tmp = mem::ManuallyDrop::new(ptr::read(arr_ptr)); - - // Intermediate state of the insertion process is always tracked by `hole`, which - // serves two purposes: - // 1. Protects integrity of `v` from panics in `is_less`. - // 2. Fills the remaining hole in `v` in the end. - // - // Panic safety: - // - // If `is_less` panics at any point during the process, `hole` will get dropped and - // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it - // initially held exactly once. - let mut hole = InsertionHole { src: &*tmp, dest: arr_ptr.add(1) }; - ptr::copy_nonoverlapping(arr_ptr.add(1), arr_ptr.add(0), 1); - - for i in 2..v.len() { - if !is_less(&v.get_unchecked(i), &*tmp) { - break; - } - ptr::copy_nonoverlapping(arr_ptr.add(i), arr_ptr.add(i - 1), 1); - hole.dest = arr_ptr.add(i); - } - // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`. - } - } -} - -/// Sort `v` assuming `v[..offset]` is already sorted. -/// -/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no -/// performance impact. Even improving performance in some cases. -#[inline(never)] -pub(super) fn insertion_sort_shift_left(v: &mut [T], offset: usize, is_less: &mut F) -where - F: FnMut(&T, &T) -> bool, -{ - let len = v.len(); - - // Using assert here improves performance. - assert!(offset != 0 && offset <= len); - - // Shift each element of the unsorted region v[i..] as far left as is needed to make v sorted. - for i in offset..len { - // SAFETY: we tested that `offset` must be at least 1, so this loop is only entered if len - // >= 2. The range is exclusive and we know `i` must be at least 1 so this slice has at - // >least len 2. - unsafe { - insert_tail(&mut v[..=i], is_less); - } - } -} - -/// Sort `v` assuming `v[offset..]` is already sorted. 
-/// -/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no -/// performance impact. Even improving performance in some cases. -#[inline(never)] -fn insertion_sort_shift_right(v: &mut [T], offset: usize, is_less: &mut F) -where - F: FnMut(&T, &T) -> bool, -{ - let len = v.len(); - - // Using assert here improves performance. - assert!(offset != 0 && offset <= len && len >= 2); - - // Shift each element of the unsorted region v[..i] as far left as is needed to make v sorted. - for i in (0..offset).rev() { - // SAFETY: we tested that `offset` must be at least 1, so this loop is only entered if len - // >= 2.We ensured that the slice length is always at least 2 long. We know that start_found - // will be at least one less than end, and the range is exclusive. Which gives us i always - // <= (end - 2). - unsafe { - insert_head(&mut v[i..len], is_less); - } - } -} - -/// Partially sorts a slice by shifting several out-of-order elements around. -/// -/// Returns `true` if the slice is sorted at the end. This function is *O*(*n*) worst-case. -#[cold] -fn partial_insertion_sort(v: &mut [T], is_less: &mut F) -> bool -where - F: FnMut(&T, &T) -> bool, -{ - // Maximum number of adjacent out-of-order pairs that will get shifted. - const MAX_STEPS: usize = 5; - // If the slice is shorter than this, don't shift any elements. - const SHORTEST_SHIFTING: usize = 50; - - let len = v.len(); - let mut i = 1; - - for _ in 0..MAX_STEPS { - // SAFETY: We already explicitly did the bound checking with `i < len`. - // All our subsequent indexing is only in the range `0 <= index < len` - unsafe { - // Find the next pair of adjacent out-of-order elements. - while i < len && !is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) { - i += 1; - } - } - - // Are we done? - if i == len { - return true; - } - - // Don't shift elements on short arrays, that has a performance cost. - if len < SHORTEST_SHIFTING { - return false; - } - - // Swap the found pair of elements. This puts them in correct order. - v.swap(i - 1, i); - - if i >= 2 { - // Shift the smaller element to the left. - insertion_sort_shift_left(&mut v[..i], i - 1, is_less); - - // Shift the greater element to the right. - insertion_sort_shift_right(&mut v[..i], 1, is_less); - } - } - - // Didn't manage to sort the slice in the limited number of steps. - false -} - -/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case. -#[cold] -#[unstable(feature = "sort_internals", reason = "internal to sort module", issue = "none")] -pub fn heapsort(v: &mut [T], mut is_less: F) -where - F: FnMut(&T, &T) -> bool, -{ - // This binary heap respects the invariant `parent >= child`. - let mut sift_down = |v: &mut [T], mut node| { - loop { - // Children of `node`. - let mut child = 2 * node + 1; - if child >= v.len() { - break; - } - - // Choose the greater child. - if child + 1 < v.len() { - // We need a branch to be sure not to out-of-bounds index, - // but it's highly predictable. The comparison, however, - // is better done branchless, especially for primitives. - child += is_less(&v[child], &v[child + 1]) as usize; - } - - // Stop if the invariant holds at `node`. - if !is_less(&v[node], &v[child]) { - break; - } - - // Swap `node` with the greater child, move one step down, and continue sifting. - v.swap(node, child); - node = child; - } - }; - - // Build the heap in linear time. - for i in (0..v.len() / 2).rev() { - sift_down(v, i); - } - - // Pop maximal elements from the heap. 
- for i in (1..v.len()).rev() { - v.swap(0, i); - sift_down(&mut v[..i], 0); - } -} - -/// Partitions `v` into elements smaller than `pivot`, followed by elements greater than or equal -/// to `pivot`. -/// -/// Returns the number of elements smaller than `pivot`. -/// -/// Partitioning is performed block-by-block in order to minimize the cost of branching operations. -/// This idea is presented in the [BlockQuicksort][pdf] paper. -/// -/// [pdf]: https://drops.dagstuhl.de/opus/volltexte/2016/6389/pdf/LIPIcs-ESA-2016-38.pdf -fn partition_in_blocks(v: &mut [T], pivot: &T, is_less: &mut F) -> usize -where - F: FnMut(&T, &T) -> bool, -{ - // Number of elements in a typical block. - const BLOCK: usize = 128; - - // The partitioning algorithm repeats the following steps until completion: - // - // 1. Trace a block from the left side to identify elements greater than or equal to the pivot. - // 2. Trace a block from the right side to identify elements smaller than the pivot. - // 3. Exchange the identified elements between the left and right side. - // - // We keep the following variables for a block of elements: - // - // 1. `block` - Number of elements in the block. - // 2. `start` - Start pointer into the `offsets` array. - // 3. `end` - End pointer into the `offsets` array. - // 4. `offsets` - Indices of out-of-order elements within the block. - - // The current block on the left side (from `l` to `l.add(block_l)`). - let mut l = v.as_mut_ptr(); - let mut block_l = BLOCK; - let mut start_l = ptr::null_mut(); - let mut end_l = ptr::null_mut(); - let mut offsets_l = [MaybeUninit::::uninit(); BLOCK]; - - // The current block on the right side (from `r.sub(block_r)` to `r`). - // SAFETY: The documentation for .add() specifically mention that `vec.as_ptr().add(vec.len())` is always safe - let mut r = unsafe { l.add(v.len()) }; - let mut block_r = BLOCK; - let mut start_r = ptr::null_mut(); - let mut end_r = ptr::null_mut(); - let mut offsets_r = [MaybeUninit::::uninit(); BLOCK]; - - // FIXME: When we get VLAs, try creating one array of length `min(v.len(), 2 * BLOCK)` rather - // than two fixed-size arrays of length `BLOCK`. VLAs might be more cache-efficient. - - // Returns the number of elements between pointers `l` (inclusive) and `r` (exclusive). - fn width(l: *mut T, r: *mut T) -> usize { - assert!(mem::size_of::() > 0); - // FIXME: this should *likely* use `offset_from`, but more - // investigation is needed (including running tests in miri). - (r.addr() - l.addr()) / mem::size_of::() - } - - loop { - // We are done with partitioning block-by-block when `l` and `r` get very close. Then we do - // some patch-up work in order to partition the remaining elements in between. - let is_done = width(l, r) <= 2 * BLOCK; - - if is_done { - // Number of remaining elements (still not compared to the pivot). - let mut rem = width(l, r); - if start_l < end_l || start_r < end_r { - rem -= BLOCK; - } - - // Adjust block sizes so that the left and right block don't overlap, but get perfectly - // aligned to cover the whole remaining gap. - if start_l < end_l { - block_r = rem; - } else if start_r < end_r { - block_l = rem; - } else { - // There were the same number of elements to switch on both blocks during the last - // iteration, so there are no remaining elements on either block. Cover the remaining - // items with roughly equally-sized blocks. 
- block_l = rem / 2; - block_r = rem - block_l; - } - debug_assert!(block_l <= BLOCK && block_r <= BLOCK); - debug_assert!(width(l, r) == block_l + block_r); - } - - if start_l == end_l { - // Trace `block_l` elements from the left side. - start_l = MaybeUninit::slice_as_mut_ptr(&mut offsets_l); - end_l = start_l; - let mut elem = l; - - for i in 0..block_l { - // SAFETY: The unsafety operations below involve the usage of the `offset`. - // According to the conditions required by the function, we satisfy them because: - // 1. `offsets_l` is stack-allocated, and thus considered separate allocated object. - // 2. The function `is_less` returns a `bool`. - // Casting a `bool` will never overflow `isize`. - // 3. We have guaranteed that `block_l` will be `<= BLOCK`. - // Plus, `end_l` was initially set to the begin pointer of `offsets_` which was declared on the stack. - // Thus, we know that even in the worst case (all invocations of `is_less` returns false) we will only be at most 1 byte pass the end. - // Another unsafety operation here is dereferencing `elem`. - // However, `elem` was initially the begin pointer to the slice which is always valid. - unsafe { - // Branchless comparison. - *end_l = i as u8; - end_l = end_l.add(!is_less(&*elem, pivot) as usize); - elem = elem.add(1); - } - } - } - - if start_r == end_r { - // Trace `block_r` elements from the right side. - start_r = MaybeUninit::slice_as_mut_ptr(&mut offsets_r); - end_r = start_r; - let mut elem = r; - - for i in 0..block_r { - // SAFETY: The unsafety operations below involve the usage of the `offset`. - // According to the conditions required by the function, we satisfy them because: - // 1. `offsets_r` is stack-allocated, and thus considered separate allocated object. - // 2. The function `is_less` returns a `bool`. - // Casting a `bool` will never overflow `isize`. - // 3. We have guaranteed that `block_r` will be `<= BLOCK`. - // Plus, `end_r` was initially set to the begin pointer of `offsets_` which was declared on the stack. - // Thus, we know that even in the worst case (all invocations of `is_less` returns true) we will only be at most 1 byte pass the end. - // Another unsafety operation here is dereferencing `elem`. - // However, `elem` was initially `1 * sizeof(T)` past the end and we decrement it by `1 * sizeof(T)` before accessing it. - // Plus, `block_r` was asserted to be less than `BLOCK` and `elem` will therefore at most be pointing to the beginning of the slice. - unsafe { - // Branchless comparison. - elem = elem.sub(1); - *end_r = i as u8; - end_r = end_r.add(is_less(&*elem, pivot) as usize); - } - } - } - - // Number of out-of-order elements to swap between the left and right side. - let count = cmp::min(width(start_l, end_l), width(start_r, end_r)); - - if count > 0 { - macro_rules! left { - () => { - l.add(usize::from(*start_l)) - }; - } - macro_rules! right { - () => { - r.sub(usize::from(*start_r) + 1) - }; - } - - // Instead of swapping one pair at the time, it is more efficient to perform a cyclic - // permutation. This is not strictly equivalent to swapping, but produces a similar - // result using fewer memory operations. - - // SAFETY: The use of `ptr::read` is valid because there is at least one element in - // both `offsets_l` and `offsets_r`, so `left!` is a valid pointer to read from. - // - // The uses of `left!` involve calls to `offset` on `l`, which points to the - // beginning of `v`. 
All the offsets pointed-to by `start_l` are at most `block_l`, so - // these `offset` calls are safe as all reads are within the block. The same argument - // applies for the uses of `right!`. - // - // The calls to `start_l.offset` are valid because there are at most `count-1` of them, - // plus the final one at the end of the unsafe block, where `count` is the minimum number - // of collected offsets in `offsets_l` and `offsets_r`, so there is no risk of there not - // being enough elements. The same reasoning applies to the calls to `start_r.offset`. - // - // The calls to `copy_nonoverlapping` are safe because `left!` and `right!` are guaranteed - // not to overlap, and are valid because of the reasoning above. - unsafe { - let tmp = ptr::read(left!()); - ptr::copy_nonoverlapping(right!(), left!(), 1); - - for _ in 1..count { - start_l = start_l.add(1); - ptr::copy_nonoverlapping(left!(), right!(), 1); - start_r = start_r.add(1); - ptr::copy_nonoverlapping(right!(), left!(), 1); - } - - ptr::copy_nonoverlapping(&tmp, right!(), 1); - mem::forget(tmp); - start_l = start_l.add(1); - start_r = start_r.add(1); - } - } - - if start_l == end_l { - // All out-of-order elements in the left block were moved. Move to the next block. - - // block-width-guarantee - // SAFETY: if `!is_done` then the slice width is guaranteed to be at least `2*BLOCK` wide. There - // are at most `BLOCK` elements in `offsets_l` because of its size, so the `offset` operation is - // safe. Otherwise, the debug assertions in the `is_done` case guarantee that - // `width(l, r) == block_l + block_r`, namely, that the block sizes have been adjusted to account - // for the smaller number of remaining elements. - l = unsafe { l.add(block_l) }; - } - - if start_r == end_r { - // All out-of-order elements in the right block were moved. Move to the previous block. - - // SAFETY: Same argument as [block-width-guarantee]. Either this is a full block `2*BLOCK`-wide, - // or `block_r` has been adjusted for the last handful of elements. - r = unsafe { r.sub(block_r) }; - } - - if is_done { - break; - } - } - - // All that remains now is at most one block (either the left or the right) with out-of-order - // elements that need to be moved. Such remaining elements can be simply shifted to the end - // within their block. - - if start_l < end_l { - // The left block remains. - // Move its remaining out-of-order elements to the far right. - debug_assert_eq!(width(l, r), block_l); - while start_l < end_l { - // remaining-elements-safety - // SAFETY: while the loop condition holds there are still elements in `offsets_l`, so it - // is safe to point `end_l` to the previous element. - // - // The `ptr::swap` is safe if both its arguments are valid for reads and writes: - // - Per the debug assert above, the distance between `l` and `r` is `block_l` - // elements, so there can be at most `block_l` remaining offsets between `start_l` - // and `end_l`. This means `r` will be moved at most `block_l` steps back, which - // makes the `r.offset` calls valid (at that point `l == r`). - // - `offsets_l` contains valid offsets into `v` collected during the partitioning of - // the last block, so the `l.offset` calls are valid. - unsafe { - end_l = end_l.sub(1); - ptr::swap(l.add(usize::from(*end_l)), r.sub(1)); - r = r.sub(1); - } - } - width(v.as_mut_ptr(), r) - } else if start_r < end_r { - // The right block remains. - // Move its remaining out-of-order elements to the far left. 
- debug_assert_eq!(width(l, r), block_r); - while start_r < end_r { - // SAFETY: See the reasoning in [remaining-elements-safety]. - unsafe { - end_r = end_r.sub(1); - ptr::swap(l, r.sub(usize::from(*end_r) + 1)); - l = l.add(1); - } - } - width(v.as_mut_ptr(), l) - } else { - // Nothing else to do, we're done. - width(v.as_mut_ptr(), l) - } -} - -/// Partitions `v` into elements smaller than `v[pivot]`, followed by elements greater than or -/// equal to `v[pivot]`. -/// -/// Returns a tuple of: -/// -/// 1. Number of elements smaller than `v[pivot]`. -/// 2. True if `v` was already partitioned. -pub(super) fn partition(v: &mut [T], pivot: usize, is_less: &mut F) -> (usize, bool) -where - F: FnMut(&T, &T) -> bool, -{ - let (mid, was_partitioned) = { - // Place the pivot at the beginning of slice. - v.swap(0, pivot); - let (pivot, v) = v.split_at_mut(1); - let pivot = &mut pivot[0]; - - // Read the pivot into a stack-allocated variable for efficiency. If a following comparison - // operation panics, the pivot will be automatically written back into the slice. - - // SAFETY: `pivot` is a reference to the first element of `v`, so `ptr::read` is safe. - let tmp = mem::ManuallyDrop::new(unsafe { ptr::read(pivot) }); - let _pivot_guard = InsertionHole { src: &*tmp, dest: pivot }; - let pivot = &*tmp; - - // Find the first pair of out-of-order elements. - let mut l = 0; - let mut r = v.len(); - - // SAFETY: The unsafety below involves indexing an array. - // For the first one: We already do the bounds checking here with `l < r`. - // For the second one: We initially have `l == 0` and `r == v.len()` and we checked that `l < r` at every indexing operation. - // From here we know that `r` must be at least `r == l` which was shown to be valid from the first one. - unsafe { - // Find the first element greater than or equal to the pivot. - while l < r && is_less(v.get_unchecked(l), pivot) { - l += 1; - } - - // Find the last element smaller that the pivot. - while l < r && !is_less(v.get_unchecked(r - 1), pivot) { - r -= 1; - } - } - - (l + partition_in_blocks(&mut v[l..r], pivot, is_less), l >= r) - - // `_pivot_guard` goes out of scope and writes the pivot (which is a stack-allocated - // variable) back into the slice where it originally was. This step is critical in ensuring - // safety! - }; - - // Place the pivot between the two partitions. - v.swap(0, mid); - - (mid, was_partitioned) -} - -/// Partitions `v` into elements equal to `v[pivot]` followed by elements greater than `v[pivot]`. -/// -/// Returns the number of elements equal to the pivot. It is assumed that `v` does not contain -/// elements smaller than the pivot. -pub(super) fn partition_equal(v: &mut [T], pivot: usize, is_less: &mut F) -> usize -where - F: FnMut(&T, &T) -> bool, -{ - // Place the pivot at the beginning of slice. - v.swap(0, pivot); - let (pivot, v) = v.split_at_mut(1); - let pivot = &mut pivot[0]; - - // Read the pivot into a stack-allocated variable for efficiency. If a following comparison - // operation panics, the pivot will be automatically written back into the slice. - // SAFETY: The pointer here is valid because it is obtained from a reference to a slice. - let tmp = mem::ManuallyDrop::new(unsafe { ptr::read(pivot) }); - let _pivot_guard = InsertionHole { src: &*tmp, dest: pivot }; - let pivot = &*tmp; - - let len = v.len(); - if len == 0 { - return 0; - } - - // Now partition the slice. - let mut l = 0; - let mut r = len; - loop { - // SAFETY: The unsafety below involves indexing an array. 
- // For the first one: We already do the bounds checking here with `l < r`. - // For the second one: We initially have `l == 0` and `r == v.len()` and we checked that `l < r` at every indexing operation. - // From here we know that `r` must be at least `r == l` which was shown to be valid from the first one. - unsafe { - // Find the first element greater than the pivot. - while l < r && !is_less(pivot, v.get_unchecked(l)) { - l += 1; - } - - // Find the last element equal to the pivot. - loop { - r -= 1; - if l >= r || !is_less(pivot, v.get_unchecked(r)) { - break; - } - } - - // Are we done? - if l >= r { - break; - } - - // Swap the found pair of out-of-order elements. - let ptr = v.as_mut_ptr(); - ptr::swap(ptr.add(l), ptr.add(r)); - l += 1; - } - } - - // We found `l` elements equal to the pivot. Add 1 to account for the pivot itself. - l + 1 - - // `_pivot_guard` goes out of scope and writes the pivot (which is a stack-allocated variable) - // back into the slice where it originally was. This step is critical in ensuring safety! -} - -/// Scatters some elements around in an attempt to break patterns that might cause imbalanced -/// partitions in quicksort. -#[cold] -pub(super) fn break_patterns(v: &mut [T]) { - let len = v.len(); - if len >= 8 { - let mut seed = len; - let mut gen_usize = || { - // Pseudorandom number generator from the "Xorshift RNGs" paper by George Marsaglia. - if usize::BITS <= 32 { - let mut r = seed as u32; - r ^= r << 13; - r ^= r >> 17; - r ^= r << 5; - seed = r as usize; - seed - } else { - let mut r = seed as u64; - r ^= r << 13; - r ^= r >> 7; - r ^= r << 17; - seed = r as usize; - seed - } - }; - - // Take random numbers modulo this number. - // The number fits into `usize` because `len` is not greater than `isize::MAX`. - let modulus = len.next_power_of_two(); - - // Some pivot candidates will be in the nearby of this index. Let's randomize them. - let pos = len / 4 * 2; - - for i in 0..3 { - // Generate a random number modulo `len`. However, in order to avoid costly operations - // we first take it modulo a power of two, and then decrease by `len` until it fits - // into the range `[0, len - 1]`. - let mut other = gen_usize() & (modulus - 1); - - // `other` is guaranteed to be less than `2 * len`. - if other >= len { - other -= len; - } - - v.swap(pos - 1 + i, other); - } - } -} - -/// Chooses a pivot in `v` and returns the index and `true` if the slice is likely already sorted. -/// -/// Elements in `v` might be reordered in the process. -pub(super) fn choose_pivot(v: &mut [T], is_less: &mut F) -> (usize, bool) -where - F: FnMut(&T, &T) -> bool, -{ - // Minimum length to choose the median-of-medians method. - // Shorter slices use the simple median-of-three method. - const SHORTEST_MEDIAN_OF_MEDIANS: usize = 50; - // Maximum number of swaps that can be performed in this function. - const MAX_SWAPS: usize = 4 * 3; - - let len = v.len(); - - // Three indices near which we are going to choose a pivot. - let mut a = len / 4 * 1; - let mut b = len / 4 * 2; - let mut c = len / 4 * 3; - - // Counts the total number of swaps we are about to perform while sorting indices. - let mut swaps = 0; - - if len >= 8 { - // Swaps indices so that `v[a] <= v[b]`. - // SAFETY: `len >= 8` so there are at least two elements in the neighborhoods of - // `a`, `b` and `c`. 
This means the three calls to `sort_adjacent` result in - // corresponding calls to `sort3` with valid 3-item neighborhoods around each - // pointer, which in turn means the calls to `sort2` are done with valid - // references. Thus the `v.get_unchecked` calls are safe, as is the `ptr::swap` - // call. - let mut sort2 = |a: &mut usize, b: &mut usize| unsafe { - if is_less(v.get_unchecked(*b), v.get_unchecked(*a)) { - ptr::swap(a, b); - swaps += 1; - } - }; - - // Swaps indices so that `v[a] <= v[b] <= v[c]`. - let mut sort3 = |a: &mut usize, b: &mut usize, c: &mut usize| { - sort2(a, b); - sort2(b, c); - sort2(a, b); - }; - - if len >= SHORTEST_MEDIAN_OF_MEDIANS { - // Finds the median of `v[a - 1], v[a], v[a + 1]` and stores the index into `a`. - let mut sort_adjacent = |a: &mut usize| { - let tmp = *a; - sort3(&mut (tmp - 1), a, &mut (tmp + 1)); - }; - - // Find medians in the neighborhoods of `a`, `b`, and `c`. - sort_adjacent(&mut a); - sort_adjacent(&mut b); - sort_adjacent(&mut c); - } - - // Find the median among `a`, `b`, and `c`. - sort3(&mut a, &mut b, &mut c); - } - - if swaps < MAX_SWAPS { - (b, swaps == 0) - } else { - // The maximum number of swaps was performed. Chances are the slice is descending or mostly - // descending, so reversing will probably help sort it faster. - v.reverse(); - (len - 1 - b, true) - } -} - -/// Sorts `v` recursively. -/// -/// If the slice had a predecessor in the original array, it is specified as `pred`. -/// -/// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero, -/// this function will immediately switch to heapsort. -fn recurse<'a, T, F>(mut v: &'a mut [T], is_less: &mut F, mut pred: Option<&'a T>, mut limit: u32) -where - F: FnMut(&T, &T) -> bool, -{ - // Slices of up to this length get sorted using insertion sort. - const MAX_INSERTION: usize = 20; - - // True if the last partitioning was reasonably balanced. - let mut was_balanced = true; - // True if the last partitioning didn't shuffle elements (the slice was already partitioned). - let mut was_partitioned = true; - - loop { - let len = v.len(); - - // Very short slices get sorted using insertion sort. - if len <= MAX_INSERTION { - if len >= 2 { - insertion_sort_shift_left(v, 1, is_less); - } - return; - } - - // If too many bad pivot choices were made, simply fall back to heapsort in order to - // guarantee `O(n * log(n))` worst-case. - if limit == 0 { - heapsort(v, is_less); - return; - } - - // If the last partitioning was imbalanced, try breaking patterns in the slice by shuffling - // some elements around. Hopefully we'll choose a better pivot this time. - if !was_balanced { - break_patterns(v); - limit -= 1; - } - - // Choose a pivot and try guessing whether the slice is already sorted. - let (pivot, likely_sorted) = choose_pivot(v, is_less); - - // If the last partitioning was decently balanced and didn't shuffle elements, and if pivot - // selection predicts the slice is likely already sorted... - if was_balanced && was_partitioned && likely_sorted { - // Try identifying several out-of-order elements and shifting them to correct - // positions. If the slice ends up being completely sorted, we're done. - if partial_insertion_sort(v, is_less) { - return; - } - } - - // If the chosen pivot is equal to the predecessor, then it's the smallest element in the - // slice. Partition the slice into elements equal to and elements greater than the pivot. - // This case is usually hit when the slice contains many duplicate elements. 
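        // Concretely (illustrative values): if a pivot equal to 1 already lies to the
        // left of this sub-slice and the newly chosen pivot is also 1, the
        // `partition_equal` call below groups every remaining 1 in a single pass, and
        // the loop then continues only on the strictly greater suffix instead of
        // producing a long chain of poorly balanced partitions.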
- if let Some(p) = pred { - if !is_less(p, &v[pivot]) { - let mid = partition_equal(v, pivot, is_less); - - // Continue sorting elements greater than the pivot. - v = &mut v[mid..]; - continue; - } - } - - // Partition the slice. - let (mid, was_p) = partition(v, pivot, is_less); - was_balanced = cmp::min(mid, len - mid) >= len / 8; - was_partitioned = was_p; - - // Split the slice into `left`, `pivot`, and `right`. - let (left, right) = v.split_at_mut(mid); - let (pivot, right) = right.split_at_mut(1); - let pivot = &pivot[0]; - - // Recurse into the shorter side only in order to minimize the total number of recursive - // calls and consume less stack space. Then just continue with the longer side (this is - // akin to tail recursion). - if left.len() < right.len() { - recurse(left, is_less, pred, limit); - v = right; - pred = Some(pivot); - } else { - recurse(right, is_less, Some(pivot), limit); - v = left; - } - } -} - -/// Sorts `v` using pattern-defeating quicksort, which is *O*(*n* \* log(*n*)) worst-case. -pub fn quicksort(v: &mut [T], mut is_less: F) -where - F: FnMut(&T, &T) -> bool, -{ - // Sorting has no meaningful behavior on zero-sized types. - if T::IS_ZST { - return; - } - - // Limit the number of imbalanced partitions to `floor(log2(len)) + 1`. - let limit = usize::BITS - v.len().leading_zeros(); - - recurse(v, &mut is_less, None, limit); -} - -/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and -/// stores the result into `v[..]`. -/// -/// # Safety -/// -/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough -/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type. -unsafe fn merge(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F) -where - F: FnMut(&T, &T) -> bool, -{ - let len = v.len(); - let v = v.as_mut_ptr(); - - // SAFETY: mid and len must be in-bounds of v. - let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) }; - - // The merge process first copies the shorter run into `buf`. Then it traces the newly copied - // run and the longer run forwards (or backwards), comparing their next unconsumed elements and - // copying the lesser (or greater) one into `v`. - // - // As soon as the shorter run is fully consumed, the process is done. If the longer run gets - // consumed first, then we must copy whatever is left of the shorter run into the remaining - // hole in `v`. - // - // Intermediate state of the process is always tracked by `hole`, which serves two purposes: - // 1. Protects integrity of `v` from panics in `is_less`. - // 2. Fills the remaining hole in `v` if the longer run gets consumed first. - // - // Panic safety: - // - // If `is_less` panics at any point during the process, `hole` will get dropped and fill the - // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every - // object it initially held exactly once. - let mut hole; - - if mid <= len - mid { - // The left run is shorter. - - // SAFETY: buf must have enough capacity for `v[..mid]`. - unsafe { - ptr::copy_nonoverlapping(v, buf, mid); - hole = MergeHole { start: buf, end: buf.add(mid), dest: v }; - } - - // Initially, these pointers point to the beginnings of their arrays. - let left = &mut hole.start; - let mut right = v_mid; - let out = &mut hole.dest; - - while *left < hole.end && right < v_end { - // Consume the lesser side. - // If equal, prefer the left run to maintain stability. 
- - // SAFETY: left and right must be valid and part of v same for out. - unsafe { - let is_l = is_less(&*right, &**left); - let to_copy = if is_l { right } else { *left }; - ptr::copy_nonoverlapping(to_copy, *out, 1); - *out = out.add(1); - right = right.add(is_l as usize); - *left = left.add(!is_l as usize); - } - } - } else { - // The right run is shorter. - - // SAFETY: buf must have enough capacity for `v[mid..]`. - unsafe { - ptr::copy_nonoverlapping(v_mid, buf, len - mid); - hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid }; - } - - // Initially, these pointers point past the ends of their arrays. - let left = &mut hole.dest; - let right = &mut hole.end; - let mut out = v_end; - - while v < *left && buf < *right { - // Consume the greater side. - // If equal, prefer the right run to maintain stability. - - // SAFETY: left and right must be valid and part of v same for out. - unsafe { - let is_l = is_less(&*right.sub(1), &*left.sub(1)); - *left = left.sub(is_l as usize); - *right = right.sub(!is_l as usize); - let to_copy = if is_l { *left } else { *right }; - out = out.sub(1); - ptr::copy_nonoverlapping(to_copy, out, 1); - } - } - } - // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of - // it will now be copied into the hole in `v`. - - // When dropped, copies the range `start..end` into `dest..`. - struct MergeHole { - start: *mut T, - end: *mut T, - dest: *mut T, - } - - impl Drop for MergeHole { - fn drop(&mut self) { - // SAFETY: `T` is not a zero-sized type, and these are pointers into a slice's elements. - unsafe { - let len = self.end.sub_ptr(self.start); - ptr::copy_nonoverlapping(self.start, self.dest, len); - } - } - } -} - -/// This merge sort borrows some (but not all) ideas from TimSort, which used to be described in -/// detail [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt). However Python -/// has switched to a Powersort based implementation. -/// -/// The algorithm identifies strictly descending and non-descending subsequences, which are called -/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed -/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are -/// satisfied: -/// -/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len` -/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len` -/// -/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case. -pub fn merge_sort( - v: &mut [T], - is_less: &mut CmpF, - elem_alloc_fn: ElemAllocF, - elem_dealloc_fn: ElemDeallocF, - run_alloc_fn: RunAllocF, - run_dealloc_fn: RunDeallocF, -) where - CmpF: FnMut(&T, &T) -> bool, - ElemAllocF: Fn(usize) -> *mut T, - ElemDeallocF: Fn(*mut T, usize), - RunAllocF: Fn(usize) -> *mut TimSortRun, - RunDeallocF: Fn(*mut TimSortRun, usize), -{ - // Slices of up to this length get sorted using insertion sort. - const MAX_INSERTION: usize = 20; - - // The caller should have already checked that. - debug_assert!(!T::IS_ZST); - - let len = v.len(); - - // Short arrays get sorted in-place via insertion sort to avoid allocations. - if len <= MAX_INSERTION { - if len >= 2 { - insertion_sort_shift_left(v, 1, is_less); - } - return; - } - - // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it - // shallow copies of the contents of `v` without risking the dtors running on copies if - // `is_less` panics. 
When merging two sorted runs, this buffer holds a copy of the shorter run, - // which will always have length at most `len / 2`. - let buf = BufGuard::new(len / 2, elem_alloc_fn, elem_dealloc_fn); - let buf_ptr = buf.buf_ptr.as_ptr(); - - let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn); - - let mut end = 0; - let mut start = 0; - - // Scan forward. Memory pre-fetching prefers forward scanning vs backwards scanning, and the - // code-gen is usually better. For the most sensitive types such as integers, these are merged - // bidirectionally at once. So there is no benefit in scanning backwards. - while end < len { - let (streak_end, was_reversed) = find_streak(&v[start..], is_less); - end += streak_end; - if was_reversed { - v[start..end].reverse(); - } - - // Insert some more elements into the run if it's too short. Insertion sort is faster than - // merge sort on short sequences, so this significantly improves performance. - end = provide_sorted_batch(v, start, end, is_less); - - // Push this run onto the stack. - runs.push(TimSortRun { start, len: end - start }); - start = end; - - // Merge some pairs of adjacent runs to satisfy the invariants. - while let Some(r) = collapse(runs.as_slice(), len) { - let left = runs[r]; - let right = runs[r + 1]; - let merge_slice = &mut v[left.start..right.start + right.len]; - // SAFETY: `buf_ptr` must hold enough capacity for the shorter of the two sides, and - // neither side may be on length 0. - unsafe { - merge(merge_slice, left.len, buf_ptr, is_less); - } - runs[r + 1] = TimSortRun { start: left.start, len: left.len + right.len }; - runs.remove(r); - } - } - - // Finally, exactly one run must remain in the stack. - debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len); - - // Examines the stack of runs and identifies the next pair of runs to merge. More specifically, - // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the - // algorithm should continue building a new run instead, `None` is returned. - // - // TimSort is infamous for its buggy implementations, as described here: - // http://envisage-project.eu/timsort-specification-and-verification/ - // - // The gist of the story is: we must enforce the invariants on the top four runs on the stack. - // Enforcing them on just top three is not sufficient to ensure that the invariants will still - // hold for *all* runs in the stack. - // - // This function correctly checks invariants for the top four runs. Additionally, if the top - // run starts at index 0, it will always demand a merge operation until the stack is fully - // collapsed, in order to complete the sort. - #[inline] - fn collapse(runs: &[TimSortRun], stop: usize) -> Option { - let n = runs.len(); - if n >= 2 - && (runs[n - 1].start + runs[n - 1].len == stop - || runs[n - 2].len <= runs[n - 1].len - || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len) - || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len)) - { - if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) } - } else { - None - } - } - - // Extremely basic versions of Vec. - // Their use is super limited and by having the code here, it allows reuse between the sort - // implementations. 
- struct BufGuard - where - ElemDeallocF: Fn(*mut T, usize), - { - buf_ptr: ptr::NonNull, - capacity: usize, - elem_dealloc_fn: ElemDeallocF, - } - - impl BufGuard - where - ElemDeallocF: Fn(*mut T, usize), - { - fn new( - len: usize, - elem_alloc_fn: ElemAllocF, - elem_dealloc_fn: ElemDeallocF, - ) -> Self - where - ElemAllocF: Fn(usize) -> *mut T, - { - Self { - buf_ptr: ptr::NonNull::new(elem_alloc_fn(len)).unwrap(), - capacity: len, - elem_dealloc_fn, - } - } - } - - impl Drop for BufGuard - where - ElemDeallocF: Fn(*mut T, usize), - { - fn drop(&mut self) { - (self.elem_dealloc_fn)(self.buf_ptr.as_ptr(), self.capacity); - } - } - - struct RunVec - where - RunAllocF: Fn(usize) -> *mut TimSortRun, - RunDeallocF: Fn(*mut TimSortRun, usize), - { - buf_ptr: ptr::NonNull, - capacity: usize, - len: usize, - run_alloc_fn: RunAllocF, - run_dealloc_fn: RunDeallocF, - } - - impl RunVec - where - RunAllocF: Fn(usize) -> *mut TimSortRun, - RunDeallocF: Fn(*mut TimSortRun, usize), - { - fn new(run_alloc_fn: RunAllocF, run_dealloc_fn: RunDeallocF) -> Self { - // Most slices can be sorted with at most 16 runs in-flight. - const START_RUN_CAPACITY: usize = 16; - - Self { - buf_ptr: ptr::NonNull::new(run_alloc_fn(START_RUN_CAPACITY)).unwrap(), - capacity: START_RUN_CAPACITY, - len: 0, - run_alloc_fn, - run_dealloc_fn, - } - } - - fn push(&mut self, val: TimSortRun) { - if self.len == self.capacity { - let old_capacity = self.capacity; - let old_buf_ptr = self.buf_ptr.as_ptr(); - - self.capacity = self.capacity * 2; - self.buf_ptr = ptr::NonNull::new((self.run_alloc_fn)(self.capacity)).unwrap(); - - // SAFETY: buf_ptr new and old were correctly allocated and old_buf_ptr has - // old_capacity valid elements. - unsafe { - ptr::copy_nonoverlapping(old_buf_ptr, self.buf_ptr.as_ptr(), old_capacity); - } - - (self.run_dealloc_fn)(old_buf_ptr, old_capacity); - } - - // SAFETY: The invariant was just checked. - unsafe { - self.buf_ptr.as_ptr().add(self.len).write(val); - } - self.len += 1; - } - - fn remove(&mut self, index: usize) { - if index >= self.len { - panic!("Index out of bounds"); - } - - // SAFETY: buf_ptr needs to be valid and len invariant upheld. - unsafe { - // the place we are taking from. - let ptr = self.buf_ptr.as_ptr().add(index); - - // Shift everything down to fill in that spot. - ptr::copy(ptr.add(1), ptr, self.len - index - 1); - } - self.len -= 1; - } - - fn as_slice(&self) -> &[TimSortRun] { - // SAFETY: Safe as long as buf_ptr is valid and len invariant was upheld. - unsafe { &*ptr::slice_from_raw_parts(self.buf_ptr.as_ptr(), self.len) } - } - - fn len(&self) -> usize { - self.len - } - } - - impl core::ops::Index for RunVec - where - RunAllocF: Fn(usize) -> *mut TimSortRun, - RunDeallocF: Fn(*mut TimSortRun, usize), - { - type Output = TimSortRun; - - fn index(&self, index: usize) -> &Self::Output { - if index < self.len { - // SAFETY: buf_ptr and len invariant must be upheld. - unsafe { - return &*(self.buf_ptr.as_ptr().add(index)); - } - } - - panic!("Index out of bounds"); - } - } - - impl core::ops::IndexMut for RunVec - where - RunAllocF: Fn(usize) -> *mut TimSortRun, - RunDeallocF: Fn(*mut TimSortRun, usize), - { - fn index_mut(&mut self, index: usize) -> &mut Self::Output { - if index < self.len { - // SAFETY: buf_ptr and len invariant must be upheld. 
- unsafe { - return &mut *(self.buf_ptr.as_ptr().add(index)); - } - } - - panic!("Index out of bounds"); - } - } - - impl Drop for RunVec - where - RunAllocF: Fn(usize) -> *mut TimSortRun, - RunDeallocF: Fn(*mut TimSortRun, usize), - { - fn drop(&mut self) { - // As long as TimSortRun is Copy we don't need to drop them individually but just the - // whole allocation. - (self.run_dealloc_fn)(self.buf_ptr.as_ptr(), self.capacity); - } - } -} - -/// Internal type used by merge_sort. -#[derive(Clone, Copy, Debug)] -pub struct TimSortRun { - len: usize, - start: usize, -} - -/// Takes a range as denoted by start and end, that is already sorted and extends it to the right if -/// necessary with sorts optimized for smaller ranges such as insertion sort. -fn provide_sorted_batch(v: &mut [T], start: usize, mut end: usize, is_less: &mut F) -> usize -where - F: FnMut(&T, &T) -> bool, -{ - let len = v.len(); - assert!(end >= start && end <= len); - - // This value is a balance between least comparisons and best performance, as - // influenced by for example cache locality. - const MIN_INSERTION_RUN: usize = 10; - - // Insert some more elements into the run if it's too short. Insertion sort is faster than - // merge sort on short sequences, so this significantly improves performance. - let start_end_diff = end - start; - - if start_end_diff < MIN_INSERTION_RUN && end < len { - // v[start_found..end] are elements that are already sorted in the input. We want to extend - // the sorted region to the left, so we push up MIN_INSERTION_RUN - 1 to the right. Which is - // more efficient that trying to push those already sorted elements to the left. - end = cmp::min(start + MIN_INSERTION_RUN, len); - let presorted_start = cmp::max(start_end_diff, 1); - - insertion_sort_shift_left(&mut v[start..end], presorted_start, is_less); - } - - end -} - -/// Finds a streak of presorted elements starting at the beginning of the slice. Returns the first -/// value that is not part of said streak, and a bool denoting whether the streak was reversed. -/// Streaks can be increasing or decreasing. -fn find_streak(v: &[T], is_less: &mut F) -> (usize, bool) -where - F: FnMut(&T, &T) -> bool, -{ - let len = v.len(); - - if len < 2 { - return (len, false); - } - - let mut end = 2; - - // SAFETY: See below specific. - unsafe { - // SAFETY: We checked that len >= 2, so 0 and 1 are valid indices. - let assume_reverse = is_less(v.get_unchecked(1), v.get_unchecked(0)); - - // SAFETY: We know end >= 2 and check end < len. - // From that follows that accessing v at end and end - 1 is safe. - if assume_reverse { - while end < len && is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) { - end += 1; - } - - (end, true) - } else { - while end < len && !is_less(v.get_unchecked(end), v.get_unchecked(end - 1)) { - end += 1; - } - (end, false) - } - } -} diff --git a/library/core/src/slice/sort/mod.rs b/library/core/src/slice/sort/mod.rs new file mode 100644 index 0000000000000..79852708b81ea --- /dev/null +++ b/library/core/src/slice/sort/mod.rs @@ -0,0 +1,8 @@ +//! This module and the contained sub-modules contains the code for efficient and robust sort +//! implementations, as well as the domain adjacent implementation of `select_nth_unstable`. 
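For orientation, a brief, illustrative look at the user-facing entry points these modules back (arbitrary example values):

    // Stable sort (driftsort): equal elements keep their original relative order.
    let mut pairs = vec![(2, 'a'), (1, 'b'), (2, 'c')];
    pairs.sort_by_key(|&(k, _)| k);
    assert_eq!(pairs, [(1, 'b'), (2, 'a'), (2, 'c')]);

    // Unstable sort (ipnsort): no allocation, order of equal elements unspecified.
    let mut nums = [5, 4, 1, 3, 2];
    nums.sort_unstable();
    assert_eq!(nums, [1, 2, 3, 4, 5]);

    // Selection: places the element at the given index at its final sorted
    // position, in worst-case linear time.
    let mut data = [9, 3, 7, 1, 5];
    let (_, median, _) = data.select_nth_unstable(2);
    assert_eq!(*median, 5);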
+ +pub mod stable; +pub mod unstable; + +pub(crate) mod select; +pub(crate) mod shared; diff --git a/library/core/src/slice/select.rs b/library/core/src/slice/sort/select.rs similarity index 77% rename from library/core/src/slice/select.rs rename to library/core/src/slice/sort/select.rs index ffc193578e075..e0c1085916e5e 100644 --- a/library/core/src/slice/select.rs +++ b/library/core/src/slice/sort/select.rs @@ -1,45 +1,78 @@ -//! Slice selection -//! //! This module contains the implementation for `slice::select_nth_unstable`. -//! It uses an introselect algorithm based on Orson Peters' pattern-defeating quicksort, -//! published at: +//! It uses an introselect algorithm based on ipnsort by Lukas Bergdoll and Orson Peters, +//! published at: //! //! The fallback algorithm used for introselect is Median of Medians using Tukey's Ninther //! for pivot selection. Using this as a fallback ensures O(n) worst case running time with //! better performance than one would get using heapsort as fallback. -use crate::cmp; use crate::mem::{self, SizedTypeProperties}; -use crate::slice::sort::{ - break_patterns, choose_pivot, insertion_sort_shift_left, partition, partition_equal, -}; -// For slices of up to this length it's probably faster to simply sort them. -// Defined at the module scope because it's used in multiple functions. -const MAX_INSERTION: usize = 10; +use crate::slice::sort::shared::pivot::choose_pivot; +use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; +use crate::slice::sort::unstable::quicksort::partition; + +/// Reorder the slice such that the element at `index` is at its final sorted position. +pub(crate) fn partition_at_index( + v: &mut [T], + index: usize, + mut is_less: F, +) -> (&mut [T], &mut T, &mut [T]) +where + F: FnMut(&T, &T) -> bool, +{ + let len = v.len(); + + // Puts a lower limit of 1 on `len`. + if index >= len { + panic!("partition_at_index index {} greater than length of slice {}", index, len); + } + + if T::IS_ZST { + // Sorting has no meaningful behavior on zero-sized types. Do nothing. + } else if index == len - 1 { + // Find max element and place it in the last position of the array. We're free to use + // `unwrap()` here because we checked that `v` is not empty. + let max_idx = max_index(v, &mut is_less).unwrap(); + v.swap(max_idx, index); + } else if index == 0 { + // Find min element and place it in the first position of the array. We're free to use + // `unwrap()` here because we checked that `v` is not empty. + let min_idx = min_index(v, &mut is_less).unwrap(); + v.swap(min_idx, index); + } else { + partition_at_index_loop(v, index, None, &mut is_less); + } + + let (left, right) = v.split_at_mut(index); + let (pivot, right) = right.split_at_mut(1); + let pivot = &mut pivot[0]; + (left, pivot, right) +} + +// For small sub-slices it's faster to use a dedicated small-sort, but because it is only called at +// most once, it doesn't make sense to use something more sophisticated than insertion-sort. +const INSERTION_SORT_THRESHOLD: usize = 16; fn partition_at_index_loop<'a, T, F>( mut v: &'a mut [T], mut index: usize, + mut ancestor_pivot: Option<&'a T>, is_less: &mut F, - mut pred: Option<&'a T>, ) where F: FnMut(&T, &T) -> bool, { - // Limit the amount of iterations and fall back to fast deterministic selection - // to ensure O(n) worst case running time. This limit needs to be constant, because - // using `ilog2(len)` like in `sort` would result in O(n log n) time complexity. 
- // The exact value of the limit is chosen somewhat arbitrarily, but for most inputs bad pivot - // selections should be relatively rare, so the limit usually shouldn't be reached - // anyways. + // Limit the amount of iterations and fall back to fast deterministic selection to ensure O(n) + // worst case running time. This limit needs to be constant, because using `ilog2(len)` like in + // `sort` would result in O(n log n) time complexity. The exact value of the limit is chosen + // somewhat arbitrarily, but for most inputs bad pivot selections should be relatively rare, so + // the limit is reached for sub-slices len / (2^limit or less). Which makes the remaining work + // with the fallback minimal in relative terms. let mut limit = 16; - // True if the last partitioning was reasonably balanced. - let mut was_balanced = true; - loop { - if v.len() <= MAX_INSERTION { - if v.len() > 1 { + if v.len() <= INSERTION_SORT_THRESHOLD { + if v.len() >= 2 { insertion_sort_shift_left(v, 1, is_less); } return; @@ -50,38 +83,35 @@ fn partition_at_index_loop<'a, T, F>( return; } - // If the last partitioning was imbalanced, try breaking patterns in the slice by shuffling - // some elements around. Hopefully we'll choose a better pivot this time. - if !was_balanced { - break_patterns(v); - limit -= 1; - } + limit -= 1; // Choose a pivot - let (pivot, _) = choose_pivot(v, is_less); + let pivot_pos = choose_pivot(v, is_less); // If the chosen pivot is equal to the predecessor, then it's the smallest element in the // slice. Partition the slice into elements equal to and elements greater than the pivot. // This case is usually hit when the slice contains many duplicate elements. - if let Some(p) = pred { - if !is_less(p, &v[pivot]) { - let mid = partition_equal(v, pivot, is_less); + if let Some(p) = ancestor_pivot { + if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) { + let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a)); + + // Continue sorting elements greater than the pivot. We know that `mid` contains + // the pivot. So we can continue after `mid`. + let mid = num_lt + 1; // If we've passed our index, then we're good. if mid > index { return; } - // Otherwise, continue sorting elements greater than the pivot. v = &mut v[mid..]; index = index - mid; - pred = None; + ancestor_pivot = None; continue; } } - let (mid, _) = partition(v, pivot, is_less); - was_balanced = cmp::min(mid, v.len() - mid) >= v.len() / 8; + let mid = partition(v, pivot_pos, is_less); // Split the slice into `left`, `pivot`, and `right`. let (left, right) = v.split_at_mut(mid); @@ -91,7 +121,7 @@ fn partition_at_index_loop<'a, T, F>( if mid < index { v = right; index = index - mid - 1; - pred = Some(pivot); + ancestor_pivot = Some(pivot); } else if mid > index { v = left; } else { @@ -122,41 +152,6 @@ fn max_index bool>(slice: &[T], is_less: &mut F) -> Optio .map(|(i, _)| i) } -/// Reorder the slice such that the element at `index` is at its final sorted position. -pub fn partition_at_index( - v: &mut [T], - index: usize, - mut is_less: F, -) -> (&mut [T], &mut T, &mut [T]) -where - F: FnMut(&T, &T) -> bool, -{ - if index >= v.len() { - panic!("partition_at_index index {} greater than length of slice {}", index, v.len()); - } - - if T::IS_ZST { - // Sorting has no meaningful behavior on zero-sized types. Do nothing. - } else if index == v.len() - 1 { - // Find max element and place it in the last position of the array. We're free to use - // `unwrap()` here because we know v must not be empty. 
- let max_idx = max_index(v, &mut is_less).unwrap(); - v.swap(max_idx, index); - } else if index == 0 { - // Find min element and place it in the first position of the array. We're free to use - // `unwrap()` here because we know v must not be empty. - let min_idx = min_index(v, &mut is_less).unwrap(); - v.swap(min_idx, index); - } else { - partition_at_index_loop(v, index, &mut is_less, None); - } - - let (left, right) = v.split_at_mut(index); - let (pivot, right) = right.split_at_mut(1); - let pivot = &mut pivot[0]; - (left, pivot, right) -} - /// Selection algorithm to select the k-th element from the slice in guaranteed O(n) time. /// This is essentially a quickselect that uses Tukey's Ninther for pivot selection fn median_of_medians bool>(mut v: &mut [T], is_less: &mut F, mut k: usize) { @@ -168,8 +163,8 @@ fn median_of_medians bool>(mut v: &mut [T], is_less: &mut // We now know that `k < v.len() <= isize::MAX` loop { - if v.len() <= MAX_INSERTION { - if v.len() > 1 { + if v.len() <= INSERTION_SORT_THRESHOLD { + if v.len() >= 2 { insertion_sort_shift_left(v, 1, is_less); } return; @@ -232,7 +227,8 @@ fn median_of_ninthers bool>(v: &mut [T], is_less: &mut F) } median_of_medians(&mut v[lo..lo + frac], is_less, pivot); - partition(v, lo + pivot, is_less).0 + + partition(v, lo + pivot, is_less) } /// Moves around the 9 elements at the indices a..i, such that diff --git a/library/core/src/slice/sort/shared/mod.rs b/library/core/src/slice/sort/shared/mod.rs new file mode 100644 index 0000000000000..ad1171bfc6a0a --- /dev/null +++ b/library/core/src/slice/sort/shared/mod.rs @@ -0,0 +1,45 @@ +use crate::marker::Freeze; + +pub(crate) mod pivot; +pub(crate) mod smallsort; + +/// SAFETY: this is safety relevant, how does this interact with the soundness holes in +/// specialization? +#[rustc_unsafe_specialization_marker] +pub(crate) trait FreezeMarker {} + +impl FreezeMarker for T {} + +/// Finds a run of sorted elements starting at the beginning of the slice. +/// +/// Returns the length of the run, and a bool that is false when the run +/// is ascending, and true if the run strictly descending. +#[inline(always)] +pub(crate) fn find_existing_run bool>( + v: &[T], + is_less: &mut F, +) -> (usize, bool) { + let len = v.len(); + if len < 2 { + return (len, false); + } + + // SAFETY: We checked that len >= 2, so 0 and 1 are valid indices. + // This also means that run_len < len implies run_len and run_len - 1 + // are valid indices as well. + unsafe { + let mut run_len = 2; + let strictly_descending = is_less(v.get_unchecked(1), v.get_unchecked(0)); + if strictly_descending { + while run_len < len && is_less(v.get_unchecked(run_len), v.get_unchecked(run_len - 1)) { + run_len += 1; + } + } else { + while run_len < len && !is_less(v.get_unchecked(run_len), v.get_unchecked(run_len - 1)) + { + run_len += 1; + } + } + (run_len, strictly_descending) + } +} diff --git a/library/core/src/slice/sort/shared/pivot.rs b/library/core/src/slice/sort/shared/pivot.rs new file mode 100644 index 0000000000000..255a1eb6c88a8 --- /dev/null +++ b/library/core/src/slice/sort/shared/pivot.rs @@ -0,0 +1,88 @@ +//! This module contains the logic for pivot selection. + +use crate::intrinsics; + +// Recursively select a pseudomedian if above this threshold. +const PSEUDO_MEDIAN_REC_THRESHOLD: usize = 64; + +/// Selects a pivot from `v`. Algorithm taken from glidesort by Orson Peters. +/// +/// This chooses a pivot by sampling an adaptive amount of points, approximating +/// the quality of a median of sqrt(n) elements. 
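As a rough, illustrative calculation of how many elements this sampling touches, based on the `f(n) = 3*f(n/8)` recursion described below and the threshold of 64 above (the helper is a sketch, not part of the implementation):

    // Count the elements compared when recursively building the pseudomedian
    // for a slice of length `len` (assumes len >= 8): each section is
    // approximated by a median of three recursively chosen representatives.
    fn sampled_elements(len: usize) -> usize {
        fn rec(n: usize) -> usize {
            if n >= 8 { 3 * rec(n / 8) } else { 3 }
        }
        if len < 64 { 3 } else { rec(len / 8) }
    }

    // sampled_elements(64) == 9, sampled_elements(512) == 27,
    // sampled_elements(4096) == 81: growth of roughly n^0.528, which the
    // comment above likens to the quality of a median of sqrt(n) elements.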
+pub fn choose_pivot bool>(v: &[T], is_less: &mut F) -> usize { + // We use unsafe code and raw pointers here because we're dealing with + // heavy recursion. Passing safe slices around would involve a lot of + // branches and function call overhead. + + let len = v.len(); + if len < 8 { + intrinsics::abort(); + } + + // SAFETY: a, b, c point to initialized regions of len_div_8 elements, + // satisfying median3 and median3_rec's preconditions as v_base points + // to an initialized region of n = len elements. + unsafe { + let v_base = v.as_ptr(); + let len_div_8 = len / 8; + + let a = v_base; // [0, floor(n/8)) + let b = v_base.add(len_div_8 * 4); // [4*floor(n/8), 5*floor(n/8)) + let c = v_base.add(len_div_8 * 7); // [7*floor(n/8), 8*floor(n/8)) + + if len < PSEUDO_MEDIAN_REC_THRESHOLD { + median3(&*a, &*b, &*c, is_less).sub_ptr(v_base) + } else { + median3_rec(a, b, c, len_div_8, is_less).sub_ptr(v_base) + } + } +} + +/// Calculates an approximate median of 3 elements from sections a, b, c, or +/// recursively from an approximation of each, if they're large enough. By +/// dividing the size of each section by 8 when recursing we have logarithmic +/// recursion depth and overall sample from f(n) = 3*f(n/8) -> f(n) = +/// O(n^(log(3)/log(8))) ~= O(n^0.528) elements. +/// +/// SAFETY: a, b, c must point to the start of initialized regions of memory of +/// at least n elements. +unsafe fn median3_rec bool>( + mut a: *const T, + mut b: *const T, + mut c: *const T, + n: usize, + is_less: &mut F, +) -> *const T { + // SAFETY: a, b, c still point to initialized regions of n / 8 elements, + // by the exact same logic as in choose_pivot. + unsafe { + if n * 8 >= PSEUDO_MEDIAN_REC_THRESHOLD { + let n8 = n / 8; + a = median3_rec(a, a.add(n8 * 4), a.add(n8 * 7), n8, is_less); + b = median3_rec(b, b.add(n8 * 4), b.add(n8 * 7), n8, is_less); + c = median3_rec(c, c.add(n8 * 4), c.add(n8 * 7), n8, is_less); + } + median3(&*a, &*b, &*c, is_less) + } +} + +/// Calculates the median of 3 elements. +/// +/// SAFETY: a, b, c must be valid initialized elements. +#[inline(always)] +fn median3 bool>(a: &T, b: &T, c: &T, is_less: &mut F) -> *const T { + // Compiler tends to make this branchless when sensible, and avoids the + // third comparison when not. + let x = is_less(a, b); + let y = is_less(a, c); + if x == y { + // If x=y=0 then b, c <= a. In this case we want to return max(b, c). + // If x=y=1 then a < b, c. In this case we want to return min(b, c). + // By toggling the outcome of b < c using XOR x we get this behavior. + let z = is_less(b, c); + if z ^ x { c } else { b } + } else { + // Either c <= a < b or b <= a < c, thus a is our median. + a + } +} diff --git a/library/core/src/slice/sort/shared/smallsort.rs b/library/core/src/slice/sort/shared/smallsort.rs new file mode 100644 index 0000000000000..8dbd45a389ca3 --- /dev/null +++ b/library/core/src/slice/sort/shared/smallsort.rs @@ -0,0 +1,843 @@ +//! This module contains a variety of sort implementations that are optimized for small lengths. + +use crate::intrinsics; +use crate::mem::{self, ManuallyDrop, MaybeUninit}; +use crate::ptr; +use crate::slice; + +use crate::slice::sort::shared::FreezeMarker; + +// It's important to differentiate between SMALL_SORT_THRESHOLD performance for +// small slices and small-sort performance sorting small sub-slices as part of +// the main quicksort loop. For the former, testing showed that the +// representative benchmarks for real-world performance are cold CPU state and +// not single-size hot benchmarks. 
For the latter the CPU will call them many +// times, so hot benchmarks are fine and more realistic. And it's worth it to +// optimize sorting small sub-slices with more sophisticated solutions than +// insertion sort. + +/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe +/// abstractions. +pub(crate) trait StableSmallSortTypeImpl: Sized { + /// For which input length <= return value of this function, is it valid to call `small_sort`. + fn small_sort_threshold() -> usize; + + /// Sorts `v` using strategies optimized for small sizes. + fn small_sort bool>( + v: &mut [Self], + scratch: &mut [MaybeUninit], + is_less: &mut F, + ); +} + +impl StableSmallSortTypeImpl for T { + #[inline(always)] + default fn small_sort_threshold() -> usize { + // Optimal number of comparisons, and good perf. + SMALL_SORT_FALLBACK_THRESHOLD + } + + #[inline(always)] + default fn small_sort bool>( + v: &mut [T], + _scratch: &mut [MaybeUninit], + is_less: &mut F, + ) { + if v.len() >= 2 { + insertion_sort_shift_left(v, 1, is_less); + } + } +} + +impl StableSmallSortTypeImpl for T { + #[inline(always)] + fn small_sort_threshold() -> usize { + SMALL_SORT_GENERAL_THRESHOLD + } + + #[inline(always)] + fn small_sort bool>( + v: &mut [T], + scratch: &mut [MaybeUninit], + is_less: &mut F, + ) { + small_sort_general_with_scratch(v, scratch, is_less); + } +} + +/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe +/// abstractions. +pub(crate) trait UnstableSmallSortTypeImpl: Sized { + /// For which input length <= return value of this function, is it valid to call `small_sort`. + fn small_sort_threshold() -> usize; + + /// Sorts `v` using strategies optimized for small sizes. + fn small_sort bool>(v: &mut [Self], is_less: &mut F); +} + +impl UnstableSmallSortTypeImpl for T { + #[inline(always)] + default fn small_sort_threshold() -> usize { + SMALL_SORT_FALLBACK_THRESHOLD + } + + #[inline(always)] + default fn small_sort(v: &mut [T], is_less: &mut F) + where + F: FnMut(&T, &T) -> bool, + { + small_sort_fallback(v, is_less); + } +} + +impl UnstableSmallSortTypeImpl for T { + #[inline(always)] + fn small_sort_threshold() -> usize { + match const { choose_unstable_small_sort::() } { + UnstalbeSmallSort::Fallback => SMALL_SORT_FALLBACK_THRESHOLD, + UnstalbeSmallSort::General => SMALL_SORT_GENERAL_THRESHOLD, + UnstalbeSmallSort::Network => SMALL_SORT_NETWORK_THRESHOLD, + } + } + + #[inline(always)] + fn small_sort(v: &mut [T], is_less: &mut F) + where + F: FnMut(&T, &T) -> bool, + { + // This construct is used to limit the LLVM IR generated, which saves large amounts of + // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn. + (const { inst_unstable_small_sort::() })(v, is_less); + } +} + +/// Optimal number of comparisons, and good perf. +const SMALL_SORT_FALLBACK_THRESHOLD: usize = 16; + +/// From a comparison perspective 20 was ~2% more efficient for fully random input, but for +/// wall-clock performance choosing 32 yielded better performance overall. +/// +/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] ! +const SMALL_SORT_GENERAL_THRESHOLD: usize = 32; + +/// [`small_sort_general`] uses [`sort8_stable`] as primitive and does a kind of ping-pong merge, +/// where the output of the first two [`sort8_stable`] calls is stored at the end of the scratch +/// buffer. This simplifies panic handling and avoids additional copies. This affects the required +/// scratch buffer size. 
+/// +/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] ! +pub(crate) const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = SMALL_SORT_GENERAL_THRESHOLD + 16; + +/// SAFETY: If you change this value, you have to adjust [`small_sort_network`] ! +const SMALL_SORT_NETWORK_THRESHOLD: usize = 32; +const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD; + +/// Using a stack array, could cause a stack overflow if the type `T` is very large. To be +/// conservative we limit the usage of small-sorts that require a stack array to types that fit +/// within this limit. +const MAX_STACK_ARRAY_SIZE: usize = 4096; + +enum UnstalbeSmallSort { + Fallback, + General, + Network, +} + +const fn choose_unstable_small_sort() -> UnstalbeSmallSort { + if T::is_copy() + && has_efficient_in_place_swap::() + && (mem::size_of::() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE + { + // Heuristic for int like types. + return UnstalbeSmallSort::Network; + } + + if (mem::size_of::() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE { + return UnstalbeSmallSort::General; + } + + UnstalbeSmallSort::Fallback +} + +const fn inst_unstable_small_sort bool>() +-> fn(&mut [T], &mut F) { + match const { choose_unstable_small_sort::() } { + UnstalbeSmallSort::Fallback => small_sort_fallback::, + UnstalbeSmallSort::General => small_sort_general::, + UnstalbeSmallSort::Network => small_sort_network::, + } +} + +fn small_sort_fallback bool>(v: &mut [T], is_less: &mut F) { + if v.len() >= 2 { + insertion_sort_shift_left(v, 1, is_less); + } +} + +fn small_sort_general bool>(v: &mut [T], is_less: &mut F) { + let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit(); + + let scratch = unsafe { + slice::from_raw_parts_mut( + stack_array.as_mut_ptr() as *mut MaybeUninit, + SMALL_SORT_GENERAL_SCRATCH_LEN, + ) + }; + + small_sort_general_with_scratch(v, scratch, is_less); +} + +fn small_sort_general_with_scratch bool>( + v: &mut [T], + scratch: &mut [MaybeUninit], + is_less: &mut F, +) { + let len = v.len(); + if len < 2 { + return; + } + + if scratch.len() < len + 16 { + intrinsics::abort(); + } + + let v_base = v.as_mut_ptr(); + let len_div_2 = len / 2; + + // SAFETY: See individual comments. + unsafe { + let scratch_base = scratch.as_mut_ptr() as *mut T; + + let presorted_len = if const { mem::size_of::() <= 16 } && len >= 16 { + // SAFETY: scratch_base is valid and has enough space. + sort8_stable(v_base, scratch_base, scratch_base.add(len), is_less); + sort8_stable( + v_base.add(len_div_2), + scratch_base.add(len_div_2), + scratch_base.add(len + 8), + is_less, + ); + + 8 + } else if len >= 8 { + // SAFETY: scratch_base is valid and has enough space. + sort4_stable(v_base, scratch_base, is_less); + sort4_stable(v_base.add(len_div_2), scratch_base.add(len_div_2), is_less); + + 4 + } else { + ptr::copy_nonoverlapping(v_base, scratch_base, 1); + ptr::copy_nonoverlapping(v_base.add(len_div_2), scratch_base.add(len_div_2), 1); + + 1 + }; + + for offset in [0, len_div_2] { + // SAFETY: at this point dst is initialized with presorted_len elements. + // We extend this to desired_len, src is valid for desired_len elements. + let src = v_base.add(offset); + let dst = scratch_base.add(offset); + let desired_len = if offset == 0 { len_div_2 } else { len - len_div_2 }; + + for i in presorted_len..desired_len { + ptr::copy_nonoverlapping(src.add(i), dst.add(i), 1); + insert_tail(dst, dst.add(i), is_less); + } + } + + // SAFETY: see comment in `CopyOnDrop::drop`. 
+ let drop_guard = CopyOnDrop { src: scratch_base, dst: v_base, len }; + + // SAFETY: at this point scratch_base is fully initialized, allowing us + // to use it as the source of our merge back into the original array. + // If a panic occurs we ensure the original array is restored to a valid + // permutation of the input through drop_guard. This technique is similar + // to ping-pong merging. + bidirectional_merge( + &*ptr::slice_from_raw_parts(drop_guard.src, drop_guard.len), + drop_guard.dst, + is_less, + ); + mem::forget(drop_guard); + } +} + +struct CopyOnDrop { + src: *const T, + dst: *mut T, + len: usize, +} + +impl Drop for CopyOnDrop { + fn drop(&mut self) { + // SAFETY: `src` must contain `len` initialized elements, and dst must + // be valid to write `len` elements. + unsafe { + ptr::copy_nonoverlapping(self.src, self.dst, self.len); + } + } +} + +fn small_sort_network(v: &mut [T], is_less: &mut F) +where + T: FreezeMarker, + F: FnMut(&T, &T) -> bool, +{ + // This implementation is tuned to be efficient for integer types. + + let len = v.len(); + if len < 2 { + return; + } + + if len > SMALL_SORT_NETWORK_SCRATCH_LEN { + intrinsics::abort(); + } + + let mut stack_array = MaybeUninit::<[T; SMALL_SORT_NETWORK_SCRATCH_LEN]>::uninit(); + + let len_div_2 = len / 2; + let no_merge = len < 18; + + let v_base = v.as_mut_ptr(); + let initial_region_len = if no_merge { len } else { len_div_2 }; + // SAFETY: Both possible values of `initial_region_len` are in-bounds. + let mut region = unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base, initial_region_len) }; + + // Avoid compiler unrolling, we *really* don't want that to happen here for binary-size reasons. + loop { + let presorted_len = if region.len() >= 13 { + sort13_optimal(region, is_less); + 13 + } else if region.len() >= 9 { + sort9_optimal(region, is_less); + 9 + } else { + 1 + }; + + insertion_sort_shift_left(region, presorted_len, is_less); + + if no_merge { + return; + } + + if region.as_ptr() != v_base { + break; + } + + // SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds. + region = + unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2) }; + } + + // SAFETY: We checked that T is Freeze and thus observation safe. + // Should is_less panic v was not modified in parity_merge and retains it's original input. + // scratch and v must not alias and scratch has v.len() space. + unsafe { + let scratch_base = stack_array.as_mut_ptr() as *mut T; + bidirectional_merge( + &mut *ptr::slice_from_raw_parts_mut(v_base, len), + scratch_base, + is_less, + ); + ptr::copy_nonoverlapping(scratch_base, v_base, len); + } +} + +/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the +/// value at position `b_pos` is less than the one at position `a_pos`. +pub unsafe fn swap_if_less(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + // SAFETY: the caller must guarantee that `a` and `b` each added to `v_base` yield valid + // pointers into `v_base`, and are properly aligned, and part of the same allocation. + unsafe { + let v_a = v_base.add(a_pos); + let v_b = v_base.add(b_pos); + + // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be + // in a well defined state, without duplicates. + + // Important to only swap if it is more and not if it is equal. is_less should return false for + // equal, so we don't swap. 
+ let should_swap = is_less(&*v_b, &*v_a); + + // This is a branchless version of swap if. + // The equivalent code with a branch would be: + // + // if should_swap { + // ptr::swap(left, right, 1); + // } + + // The goal is to generate cmov instructions here. + let left_swap = if should_swap { v_b } else { v_a }; + let right_swap = if should_swap { v_a } else { v_b }; + + let right_swap_tmp = ManuallyDrop::new(ptr::read(right_swap)); + ptr::copy(left_swap, v_a, 1); + ptr::copy_nonoverlapping(&*right_swap_tmp, v_b, 1); + } +} + +// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no +// performance impact. +fn sort9_optimal(v: &mut [T], is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + if v.len() < 9 { + intrinsics::abort(); + } + + let v_base = v.as_mut_ptr(); + + // Optimal sorting network see: + // https://bertdobbelaere.github.io/sorting_networks.html. + + // SAFETY: We checked the len. + unsafe { + swap_if_less(v_base, 0, 3, is_less); + swap_if_less(v_base, 1, 7, is_less); + swap_if_less(v_base, 2, 5, is_less); + swap_if_less(v_base, 4, 8, is_less); + swap_if_less(v_base, 0, 7, is_less); + swap_if_less(v_base, 2, 4, is_less); + swap_if_less(v_base, 3, 8, is_less); + swap_if_less(v_base, 5, 6, is_less); + swap_if_less(v_base, 0, 2, is_less); + swap_if_less(v_base, 1, 3, is_less); + swap_if_less(v_base, 4, 5, is_less); + swap_if_less(v_base, 7, 8, is_less); + swap_if_less(v_base, 1, 4, is_less); + swap_if_less(v_base, 3, 6, is_less); + swap_if_less(v_base, 5, 7, is_less); + swap_if_less(v_base, 0, 1, is_less); + swap_if_less(v_base, 2, 4, is_less); + swap_if_less(v_base, 3, 5, is_less); + swap_if_less(v_base, 6, 8, is_less); + swap_if_less(v_base, 2, 3, is_less); + swap_if_less(v_base, 4, 5, is_less); + swap_if_less(v_base, 6, 7, is_less); + swap_if_less(v_base, 1, 2, is_less); + swap_if_less(v_base, 3, 4, is_less); + swap_if_less(v_base, 5, 6, is_less); + } +} + +// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no +// performance impact. +fn sort13_optimal(v: &mut [T], is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + if v.len() < 13 { + intrinsics::abort(); + } + + let v_base = v.as_mut_ptr(); + + // Optimal sorting network see: + // https://bertdobbelaere.github.io/sorting_networks.html. + + // SAFETY: We checked the len. 
+ unsafe { + swap_if_less(v_base, 0, 12, is_less); + swap_if_less(v_base, 1, 10, is_less); + swap_if_less(v_base, 2, 9, is_less); + swap_if_less(v_base, 3, 7, is_less); + swap_if_less(v_base, 5, 11, is_less); + swap_if_less(v_base, 6, 8, is_less); + swap_if_less(v_base, 1, 6, is_less); + swap_if_less(v_base, 2, 3, is_less); + swap_if_less(v_base, 4, 11, is_less); + swap_if_less(v_base, 7, 9, is_less); + swap_if_less(v_base, 8, 10, is_less); + swap_if_less(v_base, 0, 4, is_less); + swap_if_less(v_base, 1, 2, is_less); + swap_if_less(v_base, 3, 6, is_less); + swap_if_less(v_base, 7, 8, is_less); + swap_if_less(v_base, 9, 10, is_less); + swap_if_less(v_base, 11, 12, is_less); + swap_if_less(v_base, 4, 6, is_less); + swap_if_less(v_base, 5, 9, is_less); + swap_if_less(v_base, 8, 11, is_less); + swap_if_less(v_base, 10, 12, is_less); + swap_if_less(v_base, 0, 5, is_less); + swap_if_less(v_base, 3, 8, is_less); + swap_if_less(v_base, 4, 7, is_less); + swap_if_less(v_base, 6, 11, is_less); + swap_if_less(v_base, 9, 10, is_less); + swap_if_less(v_base, 0, 1, is_less); + swap_if_less(v_base, 2, 5, is_less); + swap_if_less(v_base, 6, 9, is_less); + swap_if_less(v_base, 7, 8, is_less); + swap_if_less(v_base, 10, 11, is_less); + swap_if_less(v_base, 1, 3, is_less); + swap_if_less(v_base, 2, 4, is_less); + swap_if_less(v_base, 5, 6, is_less); + swap_if_less(v_base, 9, 10, is_less); + swap_if_less(v_base, 1, 2, is_less); + swap_if_less(v_base, 3, 4, is_less); + swap_if_less(v_base, 5, 7, is_less); + swap_if_less(v_base, 6, 8, is_less); + swap_if_less(v_base, 2, 3, is_less); + swap_if_less(v_base, 4, 5, is_less); + swap_if_less(v_base, 6, 7, is_less); + swap_if_less(v_base, 8, 9, is_less); + swap_if_less(v_base, 3, 4, is_less); + swap_if_less(v_base, 5, 6, is_less); + } +} + +/// Sorts range [begin, tail] assuming [begin, tail) is already sorted. +/// +/// # Safety +/// begin < tail and p must be valid and initialized for all begin <= p <= tail. +unsafe fn insert_tail bool>(begin: *mut T, tail: *mut T, is_less: &mut F) { + // SAFETY: see individual comments. + unsafe { + // SAFETY: in-bounds as tail > begin. + let mut sift = tail.sub(1); + if !is_less(&*tail, &*sift) { + return; + } + + // SAFETY: after this read tail is never read from again, as we only ever + // read from sift, sift < tail and we only ever decrease sift. Thus this is + // effectively a move, not a copy. Should a panic occur, or we have found + // the correct insertion position, gap_guard ensures the element is moved + // back into the array. + let tmp = ManuallyDrop::new(tail.read()); + let mut gap_guard = CopyOnDrop { src: &*tmp, dst: tail, len: 1 }; + + loop { + // SAFETY: we move sift into the gap (which is valid), and point the + // gap guard destination at sift, ensuring that if a panic occurs the + // gap is once again filled. + ptr::copy_nonoverlapping(sift, gap_guard.dst, 1); + gap_guard.dst = sift; + + if sift == begin { + break; + } + + // SAFETY: we checked that sift != begin, thus this is in-bounds. + sift = sift.sub(1); + if !is_less(&tmp, &*sift) { + break; + } + } + } +} + +/// Sort `v` assuming `v[..offset]` is already sorted. +pub fn insertion_sort_shift_left bool>( + v: &mut [T], + offset: usize, + is_less: &mut F, +) { + let len = v.len(); + if offset == 0 || offset > len { + intrinsics::abort(); + } + + // SAFETY: see individual comments. + unsafe { + // We write this basic loop directly using pointers, as when we use a + // for loop LLVM likes to unroll this loop which we do not want. 
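        // For example (illustrative values): with `offset == 3`, `v == [1, 4, 9, 2, 7]`
        // is treated as the sorted prefix [1, 4, 9] plus an unsorted tail; inserting
        // each tail element in turn gives [1, 2, 4, 9, 7] and then [1, 2, 4, 7, 9].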
+ // SAFETY: v_end is the one-past-end pointer, and we checked that + // offset <= len, thus tail is also in-bounds. + let v_base = v.as_mut_ptr(); + let v_end = v_base.add(len); + let mut tail = v_base.add(offset); + while tail != v_end { + // SAFETY: v_base and tail are both valid pointers to elements, and + // v_base < tail since we checked offset != 0. + insert_tail(v_base, tail, is_less); + + // SAFETY: we checked that tail is not yet the one-past-end pointer. + tail = tail.add(1); + } + } +} + +/// SAFETY: The caller MUST guarantee that `v_base` is valid for 4 reads and +/// `dst` is valid for 4 writes. The result will be stored in `dst[0..4]`. +pub unsafe fn sort4_stable bool>( + v_base: *const T, + dst: *mut T, + is_less: &mut F, +) { + // By limiting select to picking pointers, we are guaranteed good cmov code-gen + // regardless of type T's size. Further this only does 5 instead of 6 + // comparisons compared to a stable transposition 4 element sorting-network, + // and always copies each element exactly once. + + // SAFETY: all pointers have offset at most 3 from v_base and dst, and are + // thus in-bounds by the precondition. + unsafe { + // Stably create two pairs a <= b and c <= d. + let c1 = is_less(&*v_base.add(1), &*v_base); + let c2 = is_less(&*v_base.add(3), &*v_base.add(2)); + let a = v_base.add(c1 as usize); + let b = v_base.add(!c1 as usize); + let c = v_base.add(2 + c2 as usize); + let d = v_base.add(2 + (!c2 as usize)); + + // Compare (a, c) and (b, d) to identify max/min. We're left with two + // unknown elements, but because we are a stable sort we must know which + // one is leftmost and which one is rightmost. + // c3, c4 | min max unknown_left unknown_right + // 0, 0 | a d b c + // 0, 1 | a b c d + // 1, 0 | c d a b + // 1, 1 | c b a d + let c3 = is_less(&*c, &*a); + let c4 = is_less(&*d, &*b); + let min = select(c3, c, a); + let max = select(c4, b, d); + let unknown_left = select(c3, a, select(c4, c, b)); + let unknown_right = select(c4, d, select(c3, b, c)); + + // Sort the last two unknown elements. + let c5 = is_less(&*unknown_right, &*unknown_left); + let lo = select(c5, unknown_right, unknown_left); + let hi = select(c5, unknown_left, unknown_right); + + ptr::copy_nonoverlapping(min, dst, 1); + ptr::copy_nonoverlapping(lo, dst.add(1), 1); + ptr::copy_nonoverlapping(hi, dst.add(2), 1); + ptr::copy_nonoverlapping(max, dst.add(3), 1); + } + + #[inline(always)] + fn select(cond: bool, if_true: *const T, if_false: *const T) -> *const T { + if cond { if_true } else { if_false } + } +} + +/// SAFETY: The caller MUST guarantee that `v_base` is valid for 8 reads and +/// writes, `scratch_base` and `dst` MUST be valid for 8 writes. The result will +/// be stored in `dst[0..8]`. +unsafe fn sort8_stable bool>( + v_base: *mut T, + dst: *mut T, + scratch_base: *mut T, + is_less: &mut F, +) { + // SAFETY: these pointers are all in-bounds by the precondition of our function. + unsafe { + sort4_stable(v_base, scratch_base, is_less); + sort4_stable(v_base.add(4), scratch_base.add(4), is_less); + } + + // SAFETY: scratch_base[0..8] is now initialized, allowing us to merge back + // into dst. + unsafe { + bidirectional_merge(&*ptr::slice_from_raw_parts(scratch_base, 8), dst, is_less); + } +} + +#[inline(always)] +unsafe fn merge_up bool>( + mut left_src: *const T, + mut right_src: *const T, + mut dst: *mut T, + is_less: &mut F, +) -> (*const T, *const T, *mut T) { + // This is a branchless merge utility function. 
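The decision table in `sort4_stable` above is easier to sanity-check with indices than with raw pointers. An illustrative translation, not part of the patch (`sort4_stable_indices` is a made-up name); it returns the positions of the four inputs in sorted order using the same five comparisons and the same `select` logic.

fn sort4_stable_indices(v: &[i32; 4]) -> [usize; 4] {
    let is_less = |a: usize, b: usize| v[a] < v[b];
    let select = |cond: bool, if_true: usize, if_false: usize| if cond { if_true } else { if_false };

    // Stably create two pairs a <= b and c <= d.
    let c1 = is_less(1, 0);
    let c2 = is_less(3, 2);
    let a = c1 as usize;
    let b = (!c1) as usize;
    let c = 2 + c2 as usize;
    let d = 2 + (!c2) as usize;

    // Same min/max/unknown classification as the table above.
    let c3 = is_less(c, a);
    let c4 = is_less(d, b);
    let min = select(c3, c, a);
    let max = select(c4, b, d);
    let unknown_left = select(c3, a, select(c4, c, b));
    let unknown_right = select(c4, d, select(c3, b, c));

    // Sort the two remaining unknown elements.
    let c5 = is_less(unknown_right, unknown_left);
    let lo = select(c5, unknown_right, unknown_left);
    let hi = select(c5, unknown_left, unknown_right);

    [min, lo, hi, max]
}

fn main() {
    assert_eq!(sort4_stable_indices(&[3, 1, 2, 0]), [3, 1, 2, 0]);
    assert_eq!(sort4_stable_indices(&[0, 1, 2, 3]), [0, 1, 2, 3]);
}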
+ // The equivalent code with a branch would be: + // + // if !is_less(&*right_src, &*left_src) { + // ptr::copy_nonoverlapping(left_src, dst, 1); + // left_src = left_src.add(1); + // } else { + // ptr::copy_nonoverlapping(right_src, dst, 1); + // right_src = right_src.add(1); + // } + // dst = dst.add(1); + + // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid + // to read and `dst` is valid to write, while not aliasing. + unsafe { + let is_l = !is_less(&*right_src, &*left_src); + let src = if is_l { left_src } else { right_src }; + ptr::copy_nonoverlapping(src, dst, 1); + right_src = right_src.add(!is_l as usize); + left_src = left_src.add(is_l as usize); + dst = dst.add(1); + } + + (left_src, right_src, dst) +} + +#[inline(always)] +unsafe fn merge_down bool>( + mut left_src: *const T, + mut right_src: *const T, + mut dst: *mut T, + is_less: &mut F, +) -> (*const T, *const T, *mut T) { + // This is a branchless merge utility function. + // The equivalent code with a branch would be: + // + // if !is_less(&*right_src, &*left_src) { + // ptr::copy_nonoverlapping(right_src, dst, 1); + // right_src = right_src.wrapping_sub(1); + // } else { + // ptr::copy_nonoverlapping(left_src, dst, 1); + // left_src = left_src.wrapping_sub(1); + // } + // dst = dst.sub(1); + + // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid + // to read and `dst` is valid to write, while not aliasing. + unsafe { + let is_l = !is_less(&*right_src, &*left_src); + let src = if is_l { right_src } else { left_src }; + ptr::copy_nonoverlapping(src, dst, 1); + right_src = right_src.wrapping_sub(is_l as usize); + left_src = left_src.wrapping_sub(!is_l as usize); + dst = dst.sub(1); + } + + (left_src, right_src, dst) +} + +/// Merge v assuming v[..len / 2] and v[len / 2..] are sorted. +/// +/// Original idea for bi-directional merging by Igor van den Hoven (quadsort), +/// adapted to only use merge up and down. In contrast to the original +/// parity_merge function, it performs 2 writes instead of 4 per iteration. +/// +/// # Safety +/// The caller must guarantee that `dst` is valid for v.len() writes. +/// Also `v.as_ptr()` and `dst` must not alias and v.len() must be >= 2. +/// +/// Note that T must be Freeze, the comparison function is evaluated on outdated +/// temporary 'copies' that may not end up in the final array. +unsafe fn bidirectional_merge bool>( + v: &[T], + dst: *mut T, + is_less: &mut F, +) { + // It helps to visualize the merge: + // + // Initial: + // + // |dst (in dst) + // |left |right + // v v + // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx] + // ^ ^ + // |left_rev |right_rev + // |dst_rev (in dst) + // + // After: + // + // |dst (in dst) + // |left | |right + // v v v + // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx] + // ^ ^ ^ + // |left_rev | |right_rev + // |dst_rev (in dst) + // + // In each iteration one of left or right moves up one position, and one of + // left_rev or right_rev moves down one position, whereas dst always moves + // up one position and dst_rev always moves down one position. Assuming + // the input was sorted and the comparison function is correctly implemented + // at the end we will have left == left_rev + 1, and right == right_rev + 1, + // fully consuming the input having written it to dst. + + let len = v.len(); + let src = v.as_ptr(); + + let len_div_2 = len / 2; + + // SAFETY: The caller has to ensure that len >= 2. + unsafe { + intrinsics::assume(len_div_2 != 0); // This can avoid useless code-gen. 
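The `merge_up`/`merge_down` pair copies exactly one element and advances exactly one cursor per iteration, with the choice expressed as a select rather than a branch. An illustrative forward-only sketch in safe code (`merge_sorted` is a made-up name; the real routine additionally runs the mirrored backward merge at the same time):

fn merge_sorted(left: &[i32], right: &[i32]) -> Vec<i32> {
    let mut out = Vec::with_capacity(left.len() + right.len());
    let (mut i, mut j) = (0, 0);
    while i < left.len() && j < right.len() {
        // `!(right < left)` keeps the merge stable: ties are taken from `left`.
        let take_left = !(right[j] < left[i]);
        out.push(if take_left { left[i] } else { right[j] });
        i += take_left as usize;
        j += (!take_left) as usize;
    }
    out.extend_from_slice(&left[i..]);
    out.extend_from_slice(&right[j..]);
    out
}

fn main() {
    assert_eq!(merge_sorted(&[1, 3, 5], &[2, 3, 4]), [1, 2, 3, 3, 4, 5]);
}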
+ } + + // SAFETY: no matter what the result of the user-provided comparison function + // is, all 4 read pointers will always be in-bounds. Writing `dst` and `dst_rev` + // will always be in bounds if the caller guarantees that `dst` is valid for + // `v.len()` writes. + unsafe { + let mut left = src; + let mut right = src.add(len_div_2); + let mut dst = dst; + + let mut left_rev = src.add(len_div_2 - 1); + let mut right_rev = src.add(len - 1); + let mut dst_rev = dst.add(len - 1); + + for _ in 0..len_div_2 { + (left, right, dst) = merge_up(left, right, dst, is_less); + (left_rev, right_rev, dst_rev) = merge_down(left_rev, right_rev, dst_rev, is_less); + } + + let left_end = left_rev.wrapping_add(1); + let right_end = right_rev.wrapping_add(1); + + // Odd length, so one element is left unconsumed in the input. + if len % 2 != 0 { + let left_nonempty = left < left_end; + let last_src = if left_nonempty { left } else { right }; + ptr::copy_nonoverlapping(last_src, dst, 1); + left = left.add(left_nonempty as usize); + right = right.add((!left_nonempty) as usize); + } + + // We now should have consumed the full input exactly once. This can + // only fail if the comparison operator fails to be Ord, in which case + // we will panic and never access the inconsistent state in dst. + if left != left_end || right != right_end { + panic_on_ord_violation(); + } + } +} + +#[inline(never)] +fn panic_on_ord_violation() -> ! { + panic!("Ord violation"); +} + +#[must_use] +pub(crate) const fn has_efficient_in_place_swap() -> bool { + // Heuristic that holds true on all tested 64-bit capable architectures. + mem::size_of::() <= 8 // mem::size_of::() +} + +#[test] +fn type_info() { + assert!(has_efficient_in_place_swap::()); + assert!(has_efficient_in_place_swap::()); + assert!(!has_efficient_in_place_swap::()); + assert!(!has_efficient_in_place_swap::()); +} + +/// SAFETY: Only used for run-time optimization heuristic. +#[rustc_unsafe_specialization_marker] +trait CopyMarker {} + +impl CopyMarker for T {} + +#[const_trait] +trait IsCopy { + fn is_copy() -> bool; +} + +impl const IsCopy for T { + default fn is_copy() -> bool { + false + } +} +impl const IsCopy for T { + fn is_copy() -> bool { + true + } +} diff --git a/library/core/src/slice/sort/stable/drift.rs b/library/core/src/slice/sort/stable/drift.rs new file mode 100644 index 0000000000000..4008639095bde --- /dev/null +++ b/library/core/src/slice/sort/stable/drift.rs @@ -0,0 +1,300 @@ +//! This module contains the hybrid top-level loop combining bottom-up Mergesort with top-down +//! Quicksort. + +use crate::cmp; +use crate::intrinsics; +use crate::mem::MaybeUninit; + +use crate::slice::sort::shared::find_existing_run; +use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl; +use crate::slice::sort::stable::merge::merge; +use crate::slice::sort::stable::quicksort::quicksort; + +/// Sorts `v` based on comparison function `is_less`. If `eager_sort` is true, +/// it will only do small-sorts and physical merges, ensuring O(N * log(N)) +/// worst-case complexity. `scratch.len()` must be at least `max(v.len() / 2, +/// MIN_SMALL_SORT_SCRATCH_LEN)` otherwise the implementation may abort. +/// Fully ascending and descending inputs will be sorted with exactly N - 1 +/// comparisons. +/// +/// This is the main loop for driftsort, which uses powersort's heuristic to +/// determine in which order to merge runs, see below for details. 
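`panic_on_ord_violation` is the user-visible half of the safety story: with a comparison function that is not a strict weak ordering the sort may panic, but it must not lose or duplicate elements. A hedged check of that contract from the caller's side (whether the panic actually fires is an implementation detail):

fn main() {
    use std::cmp::Ordering;
    use std::panic::{self, AssertUnwindSafe};

    let mut v: Vec<u32> = (0..64).rev().collect();
    // Deliberately not a total order: every comparison claims "less".
    let result = panic::catch_unwind(AssertUnwindSafe(|| {
        v.sort_by(|_, _| Ordering::Less);
    }));
    let _ = result; // May be Ok(()) or Err(_), depending on detection.

    // Either way, the slice still holds a permutation of the original input.
    let mut check = v.clone();
    check.sort();
    assert_eq!(check, (0..64).collect::<Vec<u32>>());
}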
+pub fn sort bool>( + v: &mut [T], + scratch: &mut [MaybeUninit], + eager_sort: bool, + is_less: &mut F, +) { + let len = v.len(); + if len < 2 { + return; // Removing this length check *increases* code size. + } + let scale_factor = merge_tree_scale_factor(len); + + // It's important to have a relatively high entry barrier for pre-sorted + // runs, as the presence of a single such run will force on average several + // merge operations and shrink the maximum quicksort size a lot. For that + // reason we use sqrt(len) as our pre-sorted run threshold. + const MIN_SQRT_RUN_LEN: usize = 64; + let min_good_run_len = if len <= (MIN_SQRT_RUN_LEN * MIN_SQRT_RUN_LEN) { + // For small input length `MIN_SQRT_RUN_LEN` would break pattern + // detection of full or nearly sorted inputs. + cmp::min(len - len / 2, MIN_SQRT_RUN_LEN) + } else { + sqrt_approx(len) + }; + + // (stack_len, runs, desired_depths) together form a stack maintaining run + // information for the powersort heuristic. desired_depths[i] is the desired + // depth of the merge node that merges runs[i] with the run that comes after + // it. + let mut stack_len = 0; + let mut run_storage = MaybeUninit::<[DriftsortRun; 66]>::uninit(); + let runs: *mut DriftsortRun = run_storage.as_mut_ptr().cast(); + let mut desired_depth_storage = MaybeUninit::<[u8; 66]>::uninit(); + let desired_depths: *mut u8 = desired_depth_storage.as_mut_ptr().cast(); + + let mut scan_idx = 0; + let mut prev_run = DriftsortRun::new_sorted(0); // Initial dummy run. + loop { + // Compute the next run and the desired depth of the merge node between + // prev_run and next_run. On the last iteration we create a dummy run + // with root-level desired depth to fully collapse the merge tree. + let (next_run, desired_depth); + if scan_idx < len { + next_run = + create_run(&mut v[scan_idx..], scratch, min_good_run_len, eager_sort, is_less); + desired_depth = merge_tree_depth( + scan_idx - prev_run.len(), + scan_idx, + scan_idx + next_run.len(), + scale_factor, + ); + } else { + next_run = DriftsortRun::new_sorted(0); + desired_depth = 0; + }; + + // Process the merge nodes between earlier runs[i] that have a desire to + // be deeper in the merge tree than the merge node for the splitpoint + // between prev_run and next_run. + // + // SAFETY: first note that this is the only place we modify stack_len, + // runs or desired depths. We maintain the following invariants: + // 1. The first stack_len elements of runs/desired_depths are initialized. + // 2. For all valid i > 0, desired_depths[i] < desired_depths[i+1]. + // 3. The sum of all valid runs[i].len() plus prev_run.len() equals + // scan_idx. + unsafe { + while stack_len > 1 && *desired_depths.add(stack_len - 1) >= desired_depth { + // Desired depth greater than the upcoming desired depth, pop + // left neighbor run from stack and merge into prev_run. + let left = *runs.add(stack_len - 1); + let merged_len = left.len() + prev_run.len(); + let merge_start_idx = scan_idx - merged_len; + let merge_slice = v.get_unchecked_mut(merge_start_idx..scan_idx); + prev_run = logical_merge(merge_slice, scratch, left, prev_run, is_less); + stack_len -= 1; + } + + // We now know that desired_depths[stack_len - 1] < desired_depth, + // maintaining our invariant. This also guarantees we don't overflow + // the stack as merge_tree_depth(..) <= 64 and thus we can only have + // 64 distinct values on the stack before pushing, plus our initial + // dummy run, while our capacity is 66. 
+ *runs.add(stack_len) = prev_run; + *desired_depths.add(stack_len) = desired_depth; + stack_len += 1; + } + + // Break before overriding the last run with our dummy run. + if scan_idx >= len { + break; + } + + scan_idx += next_run.len(); + prev_run = next_run; + } + + if !prev_run.sorted() { + stable_quicksort(v, scratch, is_less); + } +} + +// Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods That Optimally +// Adapt to Existing Runs by J. Ian Munro and Sebastian Wild. +// +// This method forms a binary merge tree, where each internal node corresponds +// to a splitting point between the adjacent runs that have to be merged. If we +// visualize our array as the number line from 0 to 1, we want to find the +// dyadic fraction with smallest denominator that lies between the midpoints of +// our to-be-merged slices. The exponent in the dyadic fraction indicates the +// desired depth in the binary merge tree this internal node wishes to have. +// This does not always correspond to the actual depth due to the inherent +// imbalance in runs, but we follow it as closely as possible. +// +// As an optimization we rescale the number line from [0, 1) to [0, 2^62). Then +// finding the simplest dyadic fraction between midpoints corresponds to finding +// the most significant bit difference of the midpoints. We save scale_factor = +// ceil(2^62 / n) to perform this rescaling using a multiplication, avoiding +// having to repeatedly do integer divides. This rescaling isn't exact when n is +// not a power of two since we use integers and not reals, but the result is +// very close, and in fact when n < 2^30 the resulting tree is equivalent as the +// approximation errors stay entirely in the lower order bits. +// +// Thus for the splitting point between two adjacent slices [a, b) and [b, c) +// the desired depth of the corresponding merge node is CLZ((a+b)*f ^ (b+c)*f), +// where CLZ counts the number of leading zeros in an integer and f is our scale +// factor. Note that we omitted the division by two in the midpoint +// calculations, as this simply shifts the bits by one position (and thus always +// adds one to the result), and we only care about the relative depths. +// +// Finally, if we try to upper bound x = (a+b)*f giving x = (n-1 + n) * ceil(2^62 / n) then +// x < (2^62 / n + 1) * 2n +// x < 2^63 + 2n +// So as long as n < 2^62 we find that x < 2^64, meaning our operations do not +// overflow. +#[inline(always)] +fn merge_tree_scale_factor(n: usize) -> u64 { + if usize::BITS > u64::BITS { + panic!("Platform not supported"); + } + + ((1 << 62) + n as u64 - 1) / n as u64 +} + +// Note: merge_tree_depth output is < 64 when left < right as f*x and f*y must +// differ in some bit, and is <= 64 always. +#[inline(always)] +fn merge_tree_depth(left: usize, mid: usize, right: usize, scale_factor: u64) -> u8 { + let x = left as u64 + mid as u64; + let y = mid as u64 + right as u64; + ((scale_factor * x) ^ (scale_factor * y)).leading_zeros() as u8 +} + +fn sqrt_approx(n: usize) -> usize { + // Note that sqrt(n) = n^(1/2), and that 2^log2(n) = n. We combine these + // two facts to approximate sqrt(n) as 2^(log2(n) / 2). Because our integer + // log floors we want to add 0.5 to compensate for this on average, so our + // initial approximation is 2^((1 + floor(log2(n))) / 2). + // + // We then apply an iteration of Newton's method to improve our + // approximation, which for sqrt(n) is a1 = (a0 + n / a0) / 2. 
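The dyadic-fraction argument above can be checked numerically. This is an illustrative standalone sketch of the same arithmetic as `merge_tree_scale_factor` and `merge_tree_depth` (reimplemented here rather than called): a split point near the middle of the slice gets a shallow desired depth, while a split between two short runs deep inside gets a larger one.

fn main() {
    fn scale_factor(n: usize) -> u64 {
        ((1u64 << 62) + n as u64 - 1) / n as u64
    }
    fn desired_depth(left: usize, mid: usize, right: usize, f: u64) -> u8 {
        let x = (left + mid) as u64;
        let y = (mid + right) as u64;
        ((f * x) ^ (f * y)).leading_zeros() as u8
    }

    let n = 1000;
    let f = scale_factor(n);
    let near_root = desired_depth(0, 500, 1000, f); // split at the middle
    let deep_leaf = desired_depth(500, 510, 520, f); // split between two short runs
    assert!(near_root < deep_leaf);
    println!("middle split: depth {near_root}, short-run split: depth {deep_leaf}");
}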
+ // + // Finally we note that the exponentiation / division can be done directly + // with shifts. We OR with 1 to avoid zero-checks in the integer log. + let ilog = (n | 1).ilog2(); + let shift = (1 + ilog) / 2; + ((1 << shift) + (n >> shift)) / 2 +} + +// Lazy logical runs as in Glidesort. +#[inline(always)] +fn logical_merge bool>( + v: &mut [T], + scratch: &mut [MaybeUninit], + left: DriftsortRun, + right: DriftsortRun, + is_less: &mut F, +) -> DriftsortRun { + // If one or both of the runs are sorted do a physical merge, using + // quicksort to sort the unsorted run if present. We also *need* to + // physically merge if the combined runs would not fit in the scratch space + // anymore (as this would mean we are no longer able to to quicksort them). + let len = v.len(); + let can_fit_in_scratch = len <= scratch.len(); + if !can_fit_in_scratch || left.sorted() || right.sorted() { + if !left.sorted() { + stable_quicksort(&mut v[..left.len()], scratch, is_less); + } + if !right.sorted() { + stable_quicksort(&mut v[left.len()..], scratch, is_less); + } + merge(v, scratch, left.len(), is_less); + + DriftsortRun::new_sorted(len) + } else { + DriftsortRun::new_unsorted(len) + } +} + +/// Creates a new logical run. +/// +/// A logical run can either be sorted or unsorted. If there is a pre-existing +/// run that clears the `min_good_run_len` threshold it is returned as a sorted +/// run. If not, the result depends on the value of `eager_sort`. If it is true, +/// then a sorted run of length `T::SMALL_SORT_THRESHOLD` is returned, and if it +/// is false an unsorted run of length `min_good_run_len` is returned. +fn create_run bool>( + v: &mut [T], + scratch: &mut [MaybeUninit], + min_good_run_len: usize, + eager_sort: bool, + is_less: &mut F, +) -> DriftsortRun { + let len = v.len(); + if len >= min_good_run_len { + let (run_len, was_reversed) = find_existing_run(v, is_less); + + // SAFETY: find_existing_run promises to return a valid run_len. + unsafe { intrinsics::assume(run_len <= len) }; + + if run_len >= min_good_run_len { + if was_reversed { + v[..run_len].reverse(); + } + + return DriftsortRun::new_sorted(run_len); + } + } + + if eager_sort { + // We call quicksort with a len that will immediately call small-sort. + // By not calling the small-sort directly here it can always be inlined into + // the quicksort itself, making the recursive base case faster and is generally + // more binary-size efficient. + let eager_run_len = cmp::min(T::small_sort_threshold(), len); + quicksort(&mut v[..eager_run_len], scratch, 0, None, is_less); + DriftsortRun::new_sorted(eager_run_len) + } else { + DriftsortRun::new_unsorted(cmp::min(min_good_run_len, len)) + } +} + +fn stable_quicksort bool>( + v: &mut [T], + scratch: &mut [MaybeUninit], + is_less: &mut F, +) { + // Limit the number of imbalanced partitions to `2 * floor(log2(len))`. + // The binary OR by one is used to eliminate the zero-check in the logarithm. + let limit = 2 * (v.len() | 1).ilog2(); + quicksort(v, scratch, limit, None, is_less); +} + +/// Compactly stores the length of a run, and whether or not it is sorted. This +/// can always fit in a usize because the maximum slice length is isize::MAX. 
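`sqrt_approx` above only feeds a run-length threshold, so rough accuracy is all that is needed. A quick standalone check of the same formula (reimplemented here, not called):

fn main() {
    fn sqrt_approx(n: usize) -> usize {
        let ilog = (n | 1).ilog2();
        let shift = (1 + ilog) / 2;
        ((1 << shift) + (n >> shift)) / 2
    }

    for n in [10_000usize, 123_456, 1_000_000] {
        let exact = (n as f64).sqrt() as usize;
        let approx = sqrt_approx(n);
        // Stays within roughly 10% of the true square root for these inputs.
        assert!(approx.abs_diff(exact) * 10 <= exact);
        println!("n = {n}: floor(sqrt) = {exact}, approx = {approx}");
    }
}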
+#[derive(Copy, Clone)] +struct DriftsortRun(usize); + +impl DriftsortRun { + #[inline(always)] + fn new_sorted(length: usize) -> Self { + Self((length << 1) | 1) + } + + #[inline(always)] + fn new_unsorted(length: usize) -> Self { + Self(length << 1) + } + + #[inline(always)] + fn sorted(self) -> bool { + self.0 & 1 == 1 + } + + #[inline(always)] + fn len(self) -> usize { + self.0 >> 1 + } +} diff --git a/library/core/src/slice/sort/stable/merge.rs b/library/core/src/slice/sort/stable/merge.rs new file mode 100644 index 0000000000000..4f6a98bc7b89f --- /dev/null +++ b/library/core/src/slice/sort/stable/merge.rs @@ -0,0 +1,151 @@ +//! This module contains logic for performing a merge of two sorted sub-slices. + +use crate::cmp; +use crate::mem::MaybeUninit; +use crate::ptr; + +/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `scratch` as +/// temporary storage, and stores the result into `v[..]`. +pub fn merge bool>( + v: &mut [T], + scratch: &mut [MaybeUninit], + mid: usize, + is_less: &mut F, +) { + let len = v.len(); + + if mid == 0 || mid >= len || scratch.len() < cmp::min(mid, len - mid) { + return; + } + + // SAFETY: We checked that the two slices are non-empty and `mid` is in-bounds. + // We checked that the buffer `scratch` has enough capacity to hold a copy of + // the shorter slice. `merge_up` and `merge_down` are written in such a way that + // they uphold the contract described in `MergeState::drop`. + unsafe { + // The merge process first copies the shorter run into `buf`. Then it traces + // the newly copied run and the longer run forwards (or backwards), comparing + // their next unconsumed elements and copying the lesser (or greater) one into `v`. + // + // As soon as the shorter run is fully consumed, the process is done. If the + // longer run gets consumed first, then we must copy whatever is left of the + // shorter run into the remaining gap in `v`. + // + // Intermediate state of the process is always tracked by `gap`, which serves + // two purposes: + // 1. Protects integrity of `v` from panics in `is_less`. + // 2. Fills the remaining gap in `v` if the longer run gets consumed first. + + let buf = MaybeUninit::slice_as_mut_ptr(scratch); + + let v_base = v.as_mut_ptr(); + let v_mid = v_base.add(mid); + let v_end = v_base.add(len); + + let left_len = mid; + let right_len = len - mid; + + let left_is_shorter = left_len <= right_len; + let save_base = if left_is_shorter { v_base } else { v_mid }; + let save_len = if left_is_shorter { left_len } else { right_len }; + + ptr::copy_nonoverlapping(save_base, buf, save_len); + + let mut merge_state = MergeState { start: buf, end: buf.add(save_len), dst: save_base }; + + if left_is_shorter { + merge_state.merge_up(v_mid, v_end, is_less); + } else { + merge_state.merge_down(v_base, buf, v_end, is_less); + } + // Finally, `merge_state` gets dropped. If the shorter run was not fully + // consumed, whatever remains of it will now be copied into the hole in `v`. + } + + // When dropped, copies the range `start..end` into `dst..`. + struct MergeState { + start: *mut T, + end: *mut T, + dst: *mut T, + } + + impl MergeState { + /// # Safety + /// The caller MUST guarantee that `self` is initialized in a way where `start -> end` is + /// the longer sub-slice and so that `dst` can be written to at least the shorter sub-slice + /// length times. In addition `start -> end` and `right -> right_end` MUST be valid to be + /// read. This function MUST only be called once. 
+ unsafe fn merge_up bool>( + &mut self, + mut right: *const T, + right_end: *const T, + is_less: &mut F, + ) { + // SAFETY: See function safety comment. + unsafe { + let left = &mut self.start; + let out = &mut self.dst; + + while *left != self.end && right as *const T != right_end { + let consume_left = !is_less(&*right, &**left); + + let src = if consume_left { *left } else { right }; + ptr::copy_nonoverlapping(src, *out, 1); + + *left = left.add(consume_left as usize); + right = right.add(!consume_left as usize); + + *out = out.add(1); + } + } + } + + /// # Safety + /// The caller MUST guarantee that `self` is initialized in a way where `left_end <- dst` is + /// the shorter sub-slice and so that `out` can be written to at least the shorter sub-slice + /// length times. In addition `left_end <- dst` and `right_end <- end` MUST be valid to be + /// read. This function MUST only be called once. + unsafe fn merge_down bool>( + &mut self, + left_end: *const T, + right_end: *const T, + mut out: *mut T, + is_less: &mut F, + ) { + // SAFETY: See function safety comment. + unsafe { + loop { + let left = self.dst.sub(1); + let right = self.end.sub(1); + out = out.sub(1); + + let consume_left = is_less(&*right, &*left); + + let src = if consume_left { left } else { right }; + ptr::copy_nonoverlapping(src, out, 1); + + self.dst = left.add(!consume_left as usize); + self.end = right.add(consume_left as usize); + + if self.dst as *const T == left_end || self.end as *const T == right_end { + break; + } + } + } + } + } + + impl Drop for MergeState { + fn drop(&mut self) { + // SAFETY: The user of MergeState MUST ensure, that at any point this drop + // impl MAY run, for example when the user provided `is_less` panics, that + // copying the contiguous region between `start` and `end` to `dst` will + // leave the input slice `v` with each original element and all possible + // modifications observed. + unsafe { + let len = self.end.sub_ptr(self.start); + ptr::copy_nonoverlapping(self.start, self.dst, len); + } + } + } +} diff --git a/library/core/src/slice/sort/stable/mod.rs b/library/core/src/slice/sort/stable/mod.rs new file mode 100644 index 0000000000000..f9034e1b4d8ef --- /dev/null +++ b/library/core/src/slice/sort/stable/mod.rs @@ -0,0 +1,116 @@ +//! This module contains the entry points for `slice::sort`. + +use crate::cmp; +use crate::intrinsics; +use crate::mem::{self, MaybeUninit, SizedTypeProperties}; + +use crate::slice::sort::shared::smallsort::{ + insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN, +}; + +pub(crate) mod drift; +pub(crate) mod merge; +pub(crate) mod quicksort; + +/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll. +/// Design document: +/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md +/// +/// Upholds all safety properties outlined here: +/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md +#[inline(always)] +pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less: &mut F) { + // Arrays of zero-sized types are always all-equal, and thus sorted. + if T::IS_ZST { + return; + } + + // Instrumenting the standard library showed that 90+% of the calls to sort + // by rustc are either of size 0 or 1. 
+ let len = v.len(); + if intrinsics::likely(len < 2) { + return; + } + + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. + const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20; + if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) { + insertion_sort_shift_left(v, 1, is_less); + return; + } + + driftsort_main::(v, is_less); +} + +/// See [`sort`] +/// +/// Deliberately don't inline the main sorting routine entrypoint to ensure the +/// inlined insertion sort i-cache footprint remains minimal. +#[inline(never)] +fn driftsort_main bool, BufT: BufGuard>(v: &mut [T], is_less: &mut F) { + // By allocating n elements of memory we can ensure the entire input can + // be sorted using stable quicksort, which allows better performance on + // random and low-cardinality distributions. However, we still want to + // reduce our memory usage to n / 2 for large inputs. We do this by scaling + // our allocation as max(n / 2, min(n, 8MB)), ensuring we scale like n for + // small inputs and n / 2 for large inputs, without a sudden drop off. We + // also need to ensure our alloc >= MIN_SMALL_SORT_SCRATCH_LEN, as the + // small-sort always needs this much memory. + const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB + let max_full_alloc = MAX_FULL_ALLOC_BYTES / mem::size_of::(); + let len = v.len(); + let alloc_len = + cmp::max(cmp::max(len / 2, cmp::min(len, max_full_alloc)), SMALL_SORT_GENERAL_SCRATCH_LEN); + + // For small inputs 4KiB of stack storage suffices, which allows us to avoid + // calling the (de-)allocator. Benchmarks showed this was quite beneficial. + let mut stack_buf = AlignedStorage::::new(); + let stack_scratch = stack_buf.as_uninit_slice_mut(); + let mut heap_buf; + let scratch = if stack_scratch.len() >= alloc_len { + stack_scratch + } else { + heap_buf = BufT::with_capacity(alloc_len); + heap_buf.as_uninit_slice_mut() + }; + + // For small inputs using quicksort is not yet beneficial, and a single + // small-sort or two small-sorts plus a single merge outperforms it, so use + // eager mode. + let eager_sort = len <= T::small_sort_threshold() * 2; + crate::slice::sort::stable::drift::sort(v, scratch, eager_sort, is_less); +} + +#[doc(hidden)] +/// Abstracts owned memory buffer, so that sort code can live in core where no allocation is +/// possible. This trait can then be implemented in a place that has access to allocation. +pub trait BufGuard { + /// Creates new buffer that holds at least `capacity` memory. + fn with_capacity(capacity: usize) -> Self; + /// Returns mutable access to uninitialized memory owned by the buffer. + fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit]; +} + +#[repr(C)] +struct AlignedStorage { + _align: [T; 0], + storage: [MaybeUninit; N], +} + +impl AlignedStorage { + fn new() -> Self { + Self { _align: [], storage: MaybeUninit::uninit_array() } + } + + fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit] { + let len = N / mem::size_of::(); + + // SAFETY: `_align` ensures we are correctly aligned. 
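The scratch sizing rule in `driftsort_main` above is easy to tabulate. An illustrative sketch of the same policy for 8-byte elements (`scratch_len` is a made-up name, and the `SMALL_SORT_GENERAL_SCRATCH_LEN` floor is left out for brevity):

fn main() {
    fn scratch_len(len: usize, elem_size: usize) -> usize {
        const MAX_FULL_ALLOC_BYTES: usize = 8_000_000; // 8MB, as above
        let max_full_alloc = MAX_FULL_ALLOC_BYTES / elem_size;
        (len / 2).max(len.min(max_full_alloc))
    }

    assert_eq!(scratch_len(10_000, 8), 10_000); // small input: full-length scratch
    assert_eq!(scratch_len(1_000_000, 8), 1_000_000); // right at the 8MB budget
    assert_eq!(scratch_len(4_000_000, 8), 2_000_000); // large input: len / 2
}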
+ unsafe { core::slice::from_raw_parts_mut(self.storage.as_mut_ptr().cast(), len) } + } +} diff --git a/library/core/src/slice/sort/stable/quicksort.rs b/library/core/src/slice/sort/stable/quicksort.rs new file mode 100644 index 0000000000000..b6d6d3ec8ea73 --- /dev/null +++ b/library/core/src/slice/sort/stable/quicksort.rs @@ -0,0 +1,267 @@ +//! This module contains a stable quicksort and partition implementation. + +use crate::intrinsics; +use crate::mem::{self, ManuallyDrop, MaybeUninit}; +use crate::ptr; + +use crate::slice::sort::shared::pivot::choose_pivot; +use crate::slice::sort::shared::smallsort::StableSmallSortTypeImpl; +use crate::slice::sort::shared::FreezeMarker; + +/// Sorts `v` recursively using quicksort. +/// +/// `limit` when initialized with `c*log(v.len())` for some c ensures we do not +/// overflow the stack or go quadratic. +#[inline(never)] +pub fn quicksort bool>( + mut v: &mut [T], + scratch: &mut [MaybeUninit], + mut limit: u32, + mut left_ancestor_pivot: Option<&T>, + is_less: &mut F, +) { + loop { + let len = v.len(); + + if len <= T::small_sort_threshold() { + T::small_sort(v, scratch, is_less); + return; + } + + if limit == 0 { + // We have had too many bad pivots, switch to O(n log n) fallback + // algorithm. In our case that is driftsort in eager mode. + crate::slice::sort::stable::drift::sort(v, scratch, true, is_less); + return; + } + limit -= 1; + + let pivot_pos = choose_pivot(v, is_less); + // SAFETY: choose_pivot promises to return a valid pivot index. + unsafe { + intrinsics::assume(pivot_pos < v.len()); + } + + // SAFETY: We only access the temporary copy for Freeze types, otherwise + // self-modifications via `is_less` would not be observed and this would + // be unsound. Our temporary copy does not escape this scope. + let pivot_copy = unsafe { ManuallyDrop::new(ptr::read(&v[pivot_pos])) }; + let pivot_ref = (!has_direct_interior_mutability::()).then_some(&*pivot_copy); + + // We choose a pivot, and check if this pivot is equal to our left + // ancestor. If true, we do a partition putting equal elements on the + // left and do not recurse on it. This gives O(n log k) sorting for k + // distinct values, a strategy borrowed from pdqsort. For types with + // interior mutability we can't soundly create a temporary copy of the + // ancestor pivot, and use left_partition_len == 0 as our method for + // detecting when we re-use a pivot, which means we do at most three + // partition operations with pivot p instead of the optimal two. + let mut perform_equal_partition = false; + if let Some(la_pivot) = left_ancestor_pivot { + perform_equal_partition = !is_less(la_pivot, &v[pivot_pos]); + } + + let mut left_partition_len = 0; + if !perform_equal_partition { + left_partition_len = stable_partition(v, scratch, pivot_pos, false, is_less); + perform_equal_partition = left_partition_len == 0; + } + + if perform_equal_partition { + let mid_eq = stable_partition(v, scratch, pivot_pos, true, &mut |a, b| !is_less(b, a)); + v = &mut v[mid_eq..]; + left_ancestor_pivot = None; + continue; + } + + // Process left side with the next loop iter, right side with recursion. + let (left, right) = v.split_at_mut(left_partition_len); + quicksort(right, scratch, limit, pivot_ref, is_less); + v = left; + } +} + +/// Partitions `v` using pivot `p = v[pivot_pos]` and returns the number of +/// elements less than `p`. The relative order of elements that compare < p and +/// those that compare >= p is preserved - it is a stable partition. 
+/// +/// If `is_less` is not a strict total order or panics, `scratch.len() < v.len()`, +/// or `pivot_pos >= v.len()`, the result and `v`'s state is sound but unspecified. +fn stable_partition bool>( + v: &mut [T], + scratch: &mut [MaybeUninit], + pivot_pos: usize, + pivot_goes_left: bool, + is_less: &mut F, +) -> usize { + let len = v.len(); + + if intrinsics::unlikely(scratch.len() < len || pivot_pos >= len) { + core::intrinsics::abort() + } + + let v_base = v.as_ptr(); + let scratch_base = MaybeUninit::slice_as_mut_ptr(scratch); + + // The core idea is to write the values that compare as less-than to the left + // side of `scratch`, while the values that compared as greater or equal than + // `v[pivot_pos]` go to the right side of `scratch` in reverse. See + // PartitionState for details. + + // SAFETY: see individual comments. + unsafe { + // SAFETY: we made sure the scratch has length >= len and that pivot_pos + // is in-bounds. v and scratch are disjoint slices. + let pivot = v_base.add(pivot_pos); + let mut state = PartitionState::new(v_base, scratch_base, len); + + let mut pivot_in_scratch = ptr::null_mut(); + let mut loop_end_pos = pivot_pos; + + // SAFETY: this loop is equivalent to calling state.partition_one + // exactly len times. + loop { + // Ideally the outer loop won't be unrolled, to save binary size, + // but we do want the inner loop to be unrolled for small types, as + // this gave significant performance boosts in benchmarks. Unrolling + // through for _ in 0..UNROLL_LEN { .. } instead of manually improves + // compile times but has a ~10-20% performance penalty on opt-level=s. + if const { mem::size_of::() <= 16 } { + const UNROLL_LEN: usize = 4; + let unroll_end = v_base.add(loop_end_pos.saturating_sub(UNROLL_LEN - 1)); + while state.scan < unroll_end { + state.partition_one(is_less(&*state.scan, &*pivot)); + state.partition_one(is_less(&*state.scan, &*pivot)); + state.partition_one(is_less(&*state.scan, &*pivot)); + state.partition_one(is_less(&*state.scan, &*pivot)); + } + } + + let loop_end = v_base.add(loop_end_pos); + while state.scan < loop_end { + state.partition_one(is_less(&*state.scan, &*pivot)); + } + + if loop_end_pos == len { + break; + } + + // We avoid comparing pivot with itself, as this could create deadlocks for + // certain comparison operators. We also store its location later for later. + pivot_in_scratch = state.partition_one(pivot_goes_left); + + loop_end_pos = len; + } + + // `pivot` must be copied into its correct position again, because a + // comparison operator might have modified it. + if has_direct_interior_mutability::() { + ptr::copy_nonoverlapping(pivot, pivot_in_scratch, 1); + } + + // SAFETY: partition_one being called exactly len times guarantees that scratch + // is initialized with a permuted copy of `v`, and that num_left <= v.len(). + // Copying scratch[0..num_left] and scratch[num_left..v.len()] back is thus + // sound, as the values in scratch will never be read again, meaning our copies + // semantically act as moves, permuting `v`. + + // Copy all the elements < p directly from swap to v. + let v_base = v.as_mut_ptr(); + ptr::copy_nonoverlapping(scratch_base, v_base, state.num_left); + + // Copy the elements >= p in reverse order. + for i in 0..len - state.num_left { + ptr::copy_nonoverlapping( + scratch_base.add(len - 1 - i), + v_base.add(state.num_left + i), + 1, + ); + } + + state.num_left + } +} + +struct PartitionState { + // The start of the scratch auxiliary memory. 
+ scratch_base: *mut T, + // The current element that is being looked at, scans left to right through slice. + scan: *const T, + // Counts the number of elements that went to the left side, also works around: + // https://github.com/rust-lang/rust/issues/117128 + num_left: usize, + // Reverse scratch output pointer. + scratch_rev: *mut T, +} + +impl PartitionState { + /// # Safety + /// scan and scratch must point to valid disjoint buffers of length len. The + /// scan buffer must be initialized. + unsafe fn new(scan: *const T, scratch: *mut T, len: usize) -> Self { + // SAFETY: See function safety comment. + unsafe { Self { scratch_base: scratch, scan, num_left: 0, scratch_rev: scratch.add(len) } } + } + + /// Depending on the value of `towards_left` this function will write a value + /// to the growing left or right side of the scratch memory. This forms the + /// branchless core of the partition. + /// + /// # Safety + /// This function may be called at most `len` times. If it is called exactly + /// `len` times the scratch buffer then contains a copy of each element from + /// the scan buffer exactly once - a permutation, and num_left <= len. + unsafe fn partition_one(&mut self, towards_left: bool) -> *mut T { + // SAFETY: see individual comments. + unsafe { + // SAFETY: in-bounds because this function is called at most len times, and thus + // right now is incremented at most len - 1 times. Similarly, num_left < len and + // num_right < len, where num_right == i - num_left at the start of the ith + // iteration (zero-indexed). + self.scratch_rev = self.scratch_rev.sub(1); + + // SAFETY: now we have scratch_rev == base + len - (i + 1). This means + // scratch_rev + num_left == base + len - 1 - num_right < base + len. + let dst_base = if towards_left { self.scratch_base } else { self.scratch_rev }; + let dst = dst_base.add(self.num_left); + ptr::copy_nonoverlapping(self.scan, dst, 1); + + self.num_left += towards_left as usize; + self.scan = self.scan.add(1); + dst + } + } +} + +#[const_trait] +trait IsFreeze { + fn is_freeze() -> bool; +} + +impl const IsFreeze for T { + default fn is_freeze() -> bool { + false + } +} +impl const IsFreeze for T { + fn is_freeze() -> bool { + true + } +} + +#[must_use] +const fn has_direct_interior_mutability() -> bool { + // If a type has interior mutability it may alter itself during comparison + // in a way that must be preserved after the sort operation concludes. + // Otherwise a type like Mutex>> could lead to double free. + !T::is_freeze() +} + +#[test] +fn freeze_check() { + assert!(!has_direct_interior_mutability::()); + assert!(!has_direct_interior_mutability::<[u128; 2]>()); + + assert!(has_direct_interior_mutability::>()); + assert!(has_direct_interior_mutability::>()); +} diff --git a/library/core/src/slice/sort/unstable/heapsort.rs b/library/core/src/slice/sort/unstable/heapsort.rs new file mode 100644 index 0000000000000..559605ef4b6b3 --- /dev/null +++ b/library/core/src/slice/sort/unstable/heapsort.rs @@ -0,0 +1,80 @@ +//! This module contains a branchless heapsort as fallback for unstable quicksort. + +use crate::intrinsics; +use crate::ptr; + +/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case. +/// +/// Never inline this, it sits the main hot-loop in `recurse` and is meant as unlikely algorithmic +/// fallback. +/// +/// SAFETY: The caller has to guarantee that `v.len()` >= 2. 
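The Freeze distinction above has an observable counterpart for callers: the comparison closure receives shared references and may mutate elements through interior mutability, and such writes have to remain visible after the sort, which is why the code refuses to work on temporary copies of non-Freeze values. A small demonstration that such element types sort fine (this exercises the documented behavior, not the internal specialization):

fn main() {
    use std::cell::Cell;

    let comparisons = Cell::new(0u32);
    let mut v: Vec<Cell<u32>> = (0..8).rev().map(Cell::new).collect();

    // `Cell<u32>` has interior mutability, so it is exactly the kind of type
    // the specialization above classifies as non-Freeze.
    v.sort_by(|a, b| {
        comparisons.set(comparisons.get() + 1);
        a.get().cmp(&b.get())
    });

    assert!(v.windows(2).all(|w| w[0].get() <= w[1].get()));
    assert!(comparisons.get() > 0);
}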
+#[inline(never)] +pub(crate) unsafe fn heapsort(v: &mut [T], is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + // SAFETY: See function safety. + unsafe { + intrinsics::assume(v.len() >= 2); + + // Build the heap in linear time. + for i in (0..v.len() / 2).rev() { + sift_down(v, i, is_less); + } + + // Pop maximal elements from the heap. + for i in (1..v.len()).rev() { + v.swap(0, i); + sift_down(&mut v[..i], 0, is_less); + } + } +} + +// This binary heap respects the invariant `parent >= child`. +// +// SAFETY: The caller has to guarantee that node < `v.len()`. +#[inline(never)] +unsafe fn sift_down(v: &mut [T], mut node: usize, is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + // SAFETY: See function safety. + unsafe { + intrinsics::assume(node < v.len()); + } + + let len = v.len(); + + let v_base = v.as_mut_ptr(); + + loop { + // Children of `node`. + let mut child = 2 * node + 1; + if child >= len { + break; + } + + // SAFETY: The invariants and checks guarantee that both node and child are in-bounds. + unsafe { + // Choose the greater child. + if child + 1 < len { + // We need a branch to be sure not to out-of-bounds index, + // but it's highly predictable. The comparison, however, + // is better done branchless, especially for primitives. + child += is_less(&*v_base.add(child), &*v_base.add(child + 1)) as usize; + } + + // Stop if the invariant holds at `node`. + if !is_less(&*v_base.add(node), &*v_base.add(child)) { + break; + } + + // Swap `node` with the greater child, move one step down, and continue sifting. This + // could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size. + ptr::swap(v_base.add(node), v_base.add(child)); + } + + node = child; + } +} diff --git a/library/core/src/slice/sort/unstable/mod.rs b/library/core/src/slice/sort/unstable/mod.rs new file mode 100644 index 0000000000000..0d0fbd74b388e --- /dev/null +++ b/library/core/src/slice/sort/unstable/mod.rs @@ -0,0 +1,76 @@ +//! This module contains the entry points for `slice::sort_unstable`. + +use crate::intrinsics; +use crate::mem::SizedTypeProperties; + +use crate::slice::sort::shared::find_existing_run; +use crate::slice::sort::shared::smallsort::insertion_sort_shift_left; + +pub(crate) mod heapsort; +pub(crate) mod quicksort; + +/// Unstable sort called ipnsort by Lukas Bergdoll. +/// Design document: +/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md +/// +/// Upholds all safety properties outlined here: +/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md +#[inline(always)] +pub fn sort bool>(v: &mut [T], is_less: &mut F) { + // Arrays of zero-sized types are always all-equal, and thus sorted. + if T::IS_ZST { + return; + } + + // Instrumenting the standard library showed that 90+% of the calls to sort + // by rustc are either of size 0 or 1. + let len = v.len(); + if intrinsics::likely(len < 2) { + return; + } + + // More advanced sorting methods than insertion sort are faster if called in + // a hot loop for small inputs, but for general-purpose code the small + // binary size of insertion sort is more important. The instruction cache in + // modern processors is very valuable, and for a single sort call in general + // purpose code any gains from an advanced method are cancelled by i-cache + // misses during the sort, and thrashing the i-cache for surrounding code. 
+    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+    if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
+        insertion_sort_shift_left(v, 1, is_less);
+        return;
+    }
+
+    ipnsort(v, is_less);
+}
+
+/// See [`sort`]
+///
+/// Deliberately don't inline the main sorting routine entrypoint to ensure the
+/// inlined insertion sort i-cache footprint remains minimal.
+#[inline(never)]
+fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+    let (run_len, was_reversed) = find_existing_run(v, is_less);
+
+    // SAFETY: find_existing_run promises to return a valid run_len.
+    unsafe { intrinsics::assume(run_len <= len) };
+
+    if run_len == len {
+        if was_reversed {
+            v.reverse();
+        }
+
+        // It would be possible to do in-place merging here for a long existing streak. But that
+        // makes the implementation a lot bigger; users can use `slice::sort` for that use-case.
+        return;
+    }
+
+    // Limit the number of imbalanced partitions to `2 * floor(log2(len))`.
+    // The binary OR by one is used to eliminate the zero-check in the logarithm.
+    let limit = 2 * (len | 1).ilog2();
+    crate::slice::sort::unstable::quicksort::quicksort(v, None, limit, is_less);
+}
diff --git a/library/core/src/slice/sort/unstable/quicksort.rs b/library/core/src/slice/sort/unstable/quicksort.rs
new file mode 100644
index 0000000000000..c3b4339d70477
--- /dev/null
+++ b/library/core/src/slice/sort/unstable/quicksort.rs
@@ -0,0 +1,347 @@
+//! This module contains an unstable quicksort and two partition implementations.
+
+use crate::intrinsics;
+use crate::mem::{self, ManuallyDrop};
+use crate::ptr;
+
+use crate::slice::sort::shared::pivot::choose_pivot;
+use crate::slice::sort::shared::smallsort::UnstableSmallSortTypeImpl;
+
+/// Sorts `v` recursively.
+///
+/// If the slice had a predecessor in the original array, it is specified as `ancestor_pivot`.
+///
+/// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero,
+/// this function will immediately switch to heapsort.
+pub(crate) fn quicksort<'a, T, F>(
+    mut v: &'a mut [T],
+    mut ancestor_pivot: Option<&'a T>,
+    mut limit: u32,
+    is_less: &mut F,
+) where
+    F: FnMut(&T, &T) -> bool,
+{
+    loop {
+        if v.len() <= T::small_sort_threshold() {
+            T::small_sort(v, is_less);
+            return;
+        }
+
+        // If too many bad pivot choices were made, simply fall back to heapsort in order to
+        // guarantee `O(N x log(N))` worst-case.
+        if limit == 0 {
+            // SAFETY: We assume the `small_sort` threshold is at least 1.
+            unsafe {
+                crate::slice::sort::unstable::heapsort::heapsort(v, is_less);
+            }
+            return;
+        }
+
+        limit -= 1;
+
+        // Choose a pivot and try guessing whether the slice is already sorted.
+        let pivot_pos = choose_pivot(v, is_less);
+
+        // If the chosen pivot is equal to the predecessor, then it's the smallest element in the
+        // slice. Partition the slice into elements equal to and elements greater than the pivot.
+        // This case is usually hit when the slice contains many duplicate elements.
+        if let Some(p) = ancestor_pivot {
+            // SAFETY: We assume choose_pivot yields an in-bounds position.
+            if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) {
+                let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a));
+
+                // Continue sorting elements greater than the pivot. We know that `num_lt` contains
+                // the pivot. So we can continue after `num_lt`.
+                v = &mut v[(num_lt + 1)..];
+                ancestor_pivot = None;
+                continue;
+            }
+        }
+
+        // Partition the slice.
+ let num_lt = partition(v, pivot_pos, is_less); + // SAFETY: partition ensures that `num_lt` will be in-bounds. + unsafe { intrinsics::assume(num_lt < v.len()) }; + + // Split the slice into `left`, `pivot`, and `right`. + let (left, right) = v.split_at_mut(num_lt); + let (pivot, right) = right.split_at_mut(1); + let pivot = &pivot[0]; + + // Recurse into the left side. We have a fixed recursion limit, testing shows no real + // benefit for recursing into the shorter side. + quicksort(left, ancestor_pivot, limit, is_less); + + // Continue with the right side. + v = right; + ancestor_pivot = Some(pivot); + } +} + +/// Takes the input slice `v` and re-arranges elements such that when the call returns normally +/// all elements that compare true for `is_less(elem, pivot)` where `pivot == v[pivot_pos]` are +/// on the left side of `v` followed by the other elements, notionally considered greater or +/// equal to `pivot`. +/// +/// Returns the number of elements that are compared true for `is_less(elem, pivot)`. +/// +/// If `is_less` does not implement a total order the resulting order and return value are +/// unspecified. All original elements will remain in `v` and any possible modifications via +/// interior mutability will be observable. Same is true if `is_less` panics or `v.len()` +/// exceeds `scratch.len()`. +pub(crate) fn partition(v: &mut [T], pivot: usize, is_less: &mut F) -> usize +where + F: FnMut(&T, &T) -> bool, +{ + let len = v.len(); + + // Allows for panic-free code-gen by proving this property to the compiler. + if len == 0 { + return 0; + } + + // Allows for panic-free code-gen by proving this property to the compiler. + if pivot >= len { + intrinsics::abort(); + } + + // Place the pivot at the beginning of slice. + v.swap(0, pivot); + let (pivot, v_without_pivot) = v.split_at_mut(1); + + // Assuming that Rust generates noalias LLVM IR we can be sure that a partition function + // signature of the form `(v: &mut [T], pivot: &T)` guarantees that pivot and v can't alias. + // Having this guarantee is crucial for optimizations. It's possible to copy the pivot value + // into a stack value, but this creates issues for types with interior mutability mandating + // a drop guard. + let pivot = &mut pivot[0]; + + // This construct is used to limit the LLVM IR generated, which saves large amounts of + // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn. + let num_lt = (const { inst_partition::() })(v_without_pivot, pivot, is_less); + + // Place the pivot between the two partitions. + v.swap(0, num_lt); + + num_lt +} + +const fn inst_partition bool>() -> fn(&mut [T], &T, &mut F) -> usize { + const MAX_BRANCHLESS_PARTITION_SIZE: usize = 96; + if mem::size_of::() <= MAX_BRANCHLESS_PARTITION_SIZE { + // Specialize for types that are relatively cheap to copy, where branchless optimizations + // have large leverage e.g. `u64` and `String`. + partition_lomuto_branchless_cyclic:: + } else { + partition_hoare_branchy_cyclic:: + } +} + +/// See [`partition`]. +fn partition_hoare_branchy_cyclic(v: &mut [T], pivot: &T, is_less: &mut F) -> usize +where + F: FnMut(&T, &T) -> bool, +{ + let len = v.len(); + + if len == 0 { + return 0; + } + + // Optimized for large types that are expensive to move. Not optimized for integers. Optimized + // for small code-gen, assuming that is_less is an expensive operation that generates + // substantial amounts of code or a call. And that copying elements will likely be a call to + // memcpy. 
Using 2 `ptr::copy_nonoverlapping` has the chance to be faster than + // `ptr::swap_nonoverlapping` because `memcpy` can use wide SIMD based on runtime feature + // detection. Benchmarks support this analysis. + + let mut gap_opt: Option> = None; + + // SAFETY: The left-to-right scanning loop performs a bounds check, where we know that `left >= + // v_base && left < right && right <= v_base.add(len)`. The right-to-left scanning loop performs + // a bounds check ensuring that `right` is in-bounds. We checked that `len` is more than zero, + // which means that unconditional `right = right.sub(1)` is safe to do. The exit check makes + // sure that `left` and `right` never alias, making `ptr::copy_nonoverlapping` safe. The + // drop-guard `gap` ensures that should `is_less` panic we always overwrite the duplicate in the + // input. `gap.pos` stores the previous value of `right` and starts at `right` and so it too is + // in-bounds. We never pass the saved `gap.value` to `is_less` while it is inside the `GapGuard` + // thus any changes via interior mutability will be observed. + unsafe { + let v_base = v.as_mut_ptr(); + + let mut left = v_base; + let mut right = v_base.add(len); + + loop { + // Find the first element greater than the pivot. + while left < right && is_less(&*left, pivot) { + left = left.add(1); + } + + // Find the last element equal to the pivot. + loop { + right = right.sub(1); + if left >= right || is_less(&*right, pivot) { + break; + } + } + + if left >= right { + break; + } + + // Swap the found pair of out-of-order elements via cyclic permutation. + let is_first_swap_pair = gap_opt.is_none(); + + if is_first_swap_pair { + gap_opt = Some(GapGuard { pos: right, value: ManuallyDrop::new(ptr::read(left)) }); + } + + let gap = gap_opt.as_mut().unwrap_unchecked(); + + // Single place where we instantiate ptr::copy_nonoverlapping in the partition. + if !is_first_swap_pair { + ptr::copy_nonoverlapping(left, gap.pos, 1); + } + gap.pos = right; + ptr::copy_nonoverlapping(right, left, 1); + + left = left.add(1); + } + + left.sub_ptr(v_base) + + // `gap_opt` goes out of scope and overwrites the last wrong-side element on the right side + // with the first wrong-side element of the left side that was initially overwritten by the + // first wrong-side element on the right side element. + } +} + +struct PartitionState { + // The current element that is being looked at, scans left to right through slice. + right: *mut T, + // Counts the number of elements that compared less-than, also works around: + // https://github.com/rust-lang/rust/issues/117128 + num_lt: usize, + // Gap guard that tracks the temporary duplicate in the input. + gap: GapGuardRaw, +} + +fn partition_lomuto_branchless_cyclic(v: &mut [T], pivot: &T, is_less: &mut F) -> usize +where + F: FnMut(&T, &T) -> bool, +{ + // Novel partition implementation by Lukas Bergdoll and Orson Peters. Branchless Lomuto + // partition paired with a cyclic permutation. + // https://github.com/Voultapher/sort-research-rs/blob/main/writeup/lomcyc_partition/text.md + + let len = v.len(); + let v_base = v.as_mut_ptr(); + + if len == 0 { + return 0; + } + + // SAFETY: We checked that `len` is more than zero, which means that reading `v_base` is safe to + // do. From there we have a bounded loop where `v_base.add(i)` is guaranteed in-bounds. `v` and + // `pivot` can't alias because of type system rules. The drop-guard `gap` ensures that should + // `is_less` panic we always overwrite the duplicate in the input. 
`gap.pos` stores the previous + // value of `right` and starts at `v_base` and so it too is in-bounds. Given `UNROLL_LEN == 2` + // after the main loop we either have A) the last element in `v` that has not yet been processed + // because `len % 2 != 0`, or B) all elements have been processed except the gap value that was + // saved at the beginning with `ptr::read(v_base)`. In the case A) the loop will iterate twice, + // first performing loop_body to take care of the last element that didn't fit into the unroll. + // After that the behavior is the same as for B) where we use the saved value as `right` to + // overwrite the duplicate. If this very last call to `is_less` panics the saved value will be + // copied back including all possible changes via interior mutability. If `is_less` does not + // panic and the code continues we overwrite the duplicate and do `right = right.add(1)`, this + // is safe to do with `&mut *gap.value` because `T` is the same as `[T; 1]` and generating a + // pointer one past the allocation is safe. + unsafe { + let mut loop_body = |state: &mut PartitionState| { + let right_is_lt = is_less(&*state.right, pivot); + let left = v_base.add(state.num_lt); + + ptr::copy(left, state.gap.pos, 1); + ptr::copy_nonoverlapping(state.right, left, 1); + + state.gap.pos = state.right; + state.num_lt += right_is_lt as usize; + + state.right = state.right.add(1); + }; + + // Ideally we could just use GapGuard in PartitionState, but the reference that is + // materialized with `&mut state` when calling `loop_body` would create a mutable reference + // to the parent struct that contains the gap value, invalidating the reference pointer + // created from a reference to the gap value in the cleanup loop. This is only an issue + // under Stacked Borrows, Tree Borrows accepts the intuitive code using GapGuard as valid. + let mut gap_value = ManuallyDrop::new(ptr::read(v_base)); + + let mut state = PartitionState { + num_lt: 0, + right: v_base.add(1), + + gap: GapGuardRaw { pos: v_base, value: &mut *gap_value }, + }; + + // Manual unrolling that works well on x86, Arm and with opt-level=s without murdering + // compile-times. Leaving this to the compiler yields ok to bad results. + let unroll_len = const { if mem::size_of::() <= 16 { 2 } else { 1 } }; + + let unroll_end = v_base.add(len - (unroll_len - 1)); + while state.right < unroll_end { + if unroll_len == 2 { + loop_body(&mut state); + loop_body(&mut state); + } else { + loop_body(&mut state); + } + } + + // Single instantiate `loop_body` for both the unroll cleanup and cyclic permutation + // cleanup. Optimizes binary-size and compile-time. + let end = v_base.add(len); + loop { + let is_done = state.right == end; + state.right = if is_done { state.gap.value } else { state.right }; + + loop_body(&mut state); + + if is_done { + mem::forget(state.gap); + break; + } + } + + state.num_lt + } +} + +struct GapGuard { + pos: *mut T, + value: ManuallyDrop, +} + +impl Drop for GapGuard { + fn drop(&mut self) { + unsafe { + ptr::copy_nonoverlapping(&*self.value, self.pos, 1); + } + } +} + +/// Ideally this wouldn't be needed and we could just use the regular GapGuard. +/// See comment in [`partition_lomuto_branchless_cyclic`]. 
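For intuition, the core of `partition_lomuto_branchless_cyclic` is 'unconditional data movement, conditional counter bump'. An illustrative safe sketch of that shape, without the cyclic permutation or the gap guard, so it does roughly twice the copying of the real code (`lomuto_branchless_simple` is a made-up name):

fn lomuto_branchless_simple(v: &mut [i32], pivot: i32) -> usize {
    let mut num_lt = 0;
    for i in 0..v.len() {
        // The swap happens on every iteration; only the increment of `num_lt`
        // depends on the comparison result.
        v.swap(num_lt, i);
        num_lt += (v[num_lt] < pivot) as usize;
    }
    num_lt
}

fn main() {
    let mut v = [5, 1, 8, 2, 7, 3];
    let num_lt = lomuto_branchless_simple(&mut v, 5);
    assert_eq!(num_lt, 3);
    assert!(v[..num_lt].iter().all(|&x| x < 5));
    assert!(v[num_lt..].iter().all(|&x| x >= 5));
}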
+struct GapGuardRaw { + pos: *mut T, + value: *mut T, +} + +impl Drop for GapGuardRaw { + fn drop(&mut self) { + unsafe { + ptr::copy_nonoverlapping(self.value, self.pos, 1); + } + } +} diff --git a/library/core/tests/lib.rs b/library/core/tests/lib.rs index 797108a8425de..87e48354dc126 100644 --- a/library/core/tests/lib.rs +++ b/library/core/tests/lib.rs @@ -49,7 +49,6 @@ #![feature(is_sorted)] #![feature(layout_for_ptr)] #![feature(pattern)] -#![feature(sort_internals)] #![feature(slice_take)] #![feature(slice_from_ptr_range)] #![feature(slice_split_once)] diff --git a/library/core/tests/slice.rs b/library/core/tests/slice.rs index ffe8ffcc7f2f7..3870fcb5f0f5f 100644 --- a/library/core/tests/slice.rs +++ b/library/core/tests/slice.rs @@ -1803,9 +1803,8 @@ fn brute_force_rotate_test_1() { #[test] #[cfg(not(target_arch = "wasm32"))] fn sort_unstable() { - use core::cmp::Ordering::{Equal, Greater, Less}; - use core::slice::heapsort; - use rand::{seq::SliceRandom, Rng}; + // use core::cmp::Ordering::{Equal, Greater, Less}; + use rand::Rng; // Miri is too slow (but still need to `chain` to make the types match) let lens = if cfg!(miri) { (2..20).chain(0..0) } else { (2..25).chain(500..510) }; @@ -1839,31 +1838,10 @@ fn sort_unstable() { tmp.copy_from_slice(v); tmp.sort_unstable_by(|a, b| b.cmp(a)); assert!(tmp.windows(2).all(|w| w[0] >= w[1])); - - // Test heapsort using `<` operator. - tmp.copy_from_slice(v); - heapsort(tmp, |a, b| a < b); - assert!(tmp.windows(2).all(|w| w[0] <= w[1])); - - // Test heapsort using `>` operator. - tmp.copy_from_slice(v); - heapsort(tmp, |a, b| a > b); - assert!(tmp.windows(2).all(|w| w[0] >= w[1])); } } } - // Sort using a completely random comparison function. - // This will reorder the elements *somehow*, but won't panic. - for i in 0..v.len() { - v[i] = i as i32; - } - v.sort_unstable_by(|_, _| *[Less, Equal, Greater].choose(&mut rng).unwrap()); - v.sort_unstable(); - for i in 0..v.len() { - assert_eq!(v[i], i as i32); - } - // Should not panic. [0i32; 0].sort_unstable(); [(); 10].sort_unstable(); From 1a6b0e410e3b32f8641b9ee2bcf992712c8fa72f Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Tue, 16 Apr 2024 20:44:01 +0200 Subject: [PATCH 02/13] Fix tidy errors --- library/core/src/slice/sort/select.rs | 5 ++++- library/core/src/slice/sort/shared/smallsort.rs | 15 +++++---------- library/core/src/slice/sort/stable/quicksort.rs | 9 --------- library/core/src/slice/sort/unstable/quicksort.rs | 4 ++++ library/core/tests/slice.rs | 1 - .../src/library-features/sort-internals.md | 5 ----- 6 files changed, 13 insertions(+), 26 deletions(-) delete mode 100644 src/doc/unstable-book/src/library-features/sort-internals.md diff --git a/library/core/src/slice/sort/select.rs b/library/core/src/slice/sort/select.rs index e0c1085916e5e..6212def30416b 100644 --- a/library/core/src/slice/sort/select.rs +++ b/library/core/src/slice/sort/select.rs @@ -92,7 +92,10 @@ fn partition_at_index_loop<'a, T, F>( // slice. Partition the slice into elements equal to and elements greater than the pivot. // This case is usually hit when the slice contains many duplicate elements. if let Some(p) = ancestor_pivot { - if !is_less(p, unsafe { v.get_unchecked(pivot_pos) }) { + // SAFETY: choose_pivot promises to return a valid pivot position. + let pivot = unsafe { v.get_unchecked(pivot_pos) }; + + if !is_less(p, pivot) { let num_lt = partition(v, pivot_pos, &mut |a, b| !is_less(b, a)); // Continue sorting elements greater than the pivot. 
We know that `mid` contains diff --git a/library/core/src/slice/sort/shared/smallsort.rs b/library/core/src/slice/sort/shared/smallsort.rs index 8dbd45a389ca3..6e4424310e80f 100644 --- a/library/core/src/slice/sort/shared/smallsort.rs +++ b/library/core/src/slice/sort/shared/smallsort.rs @@ -177,6 +177,8 @@ fn small_sort_fallback bool>(v: &mut [T], is_less: &mut F fn small_sort_general bool>(v: &mut [T], is_less: &mut F) { let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit(); + // SAFETY: The memory is backed by `stack_array`, and the operation is safe as long as the len + // is the same. let scratch = unsafe { slice::from_raw_parts_mut( stack_array.as_mut_ptr() as *mut MaybeUninit, @@ -327,8 +329,9 @@ where } // SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds. - region = - unsafe { &mut *ptr::slice_from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2) }; + unsafe { + region = &mut *ptr::slice_from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2) + }; } // SAFETY: We checked that T is Freeze and thus observation safe. @@ -812,14 +815,6 @@ pub(crate) const fn has_efficient_in_place_swap() -> bool { mem::size_of::() <= 8 // mem::size_of::() } -#[test] -fn type_info() { - assert!(has_efficient_in_place_swap::()); - assert!(has_efficient_in_place_swap::()); - assert!(!has_efficient_in_place_swap::()); - assert!(!has_efficient_in_place_swap::()); -} - /// SAFETY: Only used for run-time optimization heuristic. #[rustc_unsafe_specialization_marker] trait CopyMarker {} diff --git a/library/core/src/slice/sort/stable/quicksort.rs b/library/core/src/slice/sort/stable/quicksort.rs index b6d6d3ec8ea73..e1734ce8d8b46 100644 --- a/library/core/src/slice/sort/stable/quicksort.rs +++ b/library/core/src/slice/sort/stable/quicksort.rs @@ -256,12 +256,3 @@ const fn has_direct_interior_mutability() -> bool { // Otherwise a type like Mutex>> could lead to double free. !T::is_freeze() } - -#[test] -fn freeze_check() { - assert!(!has_direct_interior_mutability::()); - assert!(!has_direct_interior_mutability::<[u128; 2]>()); - - assert!(has_direct_interior_mutability::>()); - assert!(has_direct_interior_mutability::>()); -} diff --git a/library/core/src/slice/sort/unstable/quicksort.rs b/library/core/src/slice/sort/unstable/quicksort.rs index c3b4339d70477..533b5b0eec767 100644 --- a/library/core/src/slice/sort/unstable/quicksort.rs +++ b/library/core/src/slice/sort/unstable/quicksort.rs @@ -325,6 +325,8 @@ struct GapGuard { impl Drop for GapGuard { fn drop(&mut self) { + // SAFETY: `self` MUST be constructed in a way that makes copying the gap value into + // `self.pos` sound. unsafe { ptr::copy_nonoverlapping(&*self.value, self.pos, 1); } @@ -340,6 +342,8 @@ struct GapGuardRaw { impl Drop for GapGuardRaw { fn drop(&mut self) { + // SAFETY: `self` MUST be constructed in a way that makes copying the gap value into + // `self.pos` sound. 
unsafe { ptr::copy_nonoverlapping(self.value, self.pos, 1); } diff --git a/library/core/tests/slice.rs b/library/core/tests/slice.rs index 3870fcb5f0f5f..1fe92afb6bb41 100644 --- a/library/core/tests/slice.rs +++ b/library/core/tests/slice.rs @@ -1803,7 +1803,6 @@ fn brute_force_rotate_test_1() { #[test] #[cfg(not(target_arch = "wasm32"))] fn sort_unstable() { - // use core::cmp::Ordering::{Equal, Greater, Less}; use rand::Rng; // Miri is too slow (but still need to `chain` to make the types match) diff --git a/src/doc/unstable-book/src/library-features/sort-internals.md b/src/doc/unstable-book/src/library-features/sort-internals.md deleted file mode 100644 index 6f2385e53008a..0000000000000 --- a/src/doc/unstable-book/src/library-features/sort-internals.md +++ /dev/null @@ -1,5 +0,0 @@ -# `sort_internals` - -This feature is internal to the Rust compiler and is not intended for general use. - ------------------------- From d20b1190cfb92faab93e0528555753c3823d28d0 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Tue, 16 Apr 2024 20:59:17 +0200 Subject: [PATCH 03/13] Move BufGuard impl outside of function --- library/alloc/src/slice.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs index 104f4ee753b16..79540fcd3955f 100644 --- a/library/alloc/src/slice.rs +++ b/library/alloc/src/slice.rs @@ -864,18 +864,17 @@ fn stable_sort(v: &mut [T], mut is_less: F) where F: FnMut(&T, &T) -> bool, { - use sort::stable::BufGuard; - - #[unstable(issue = "none", feature = "std_internals")] - impl BufGuard for Vec { - fn with_capacity(capacity: usize) -> Self { - Vec::with_capacity(capacity) - } + sort::stable::sort::>(v, &mut is_less); +} - fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit] { - self.spare_capacity_mut() - } +#[cfg(not(no_global_oom_handling))] +#[unstable(issue = "none", feature = "std_internals")] +impl sort::stable::BufGuard for Vec { + fn with_capacity(capacity: usize) -> Self { + Vec::with_capacity(capacity) } - sort::stable::sort::>(v, &mut is_less); + fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit] { + self.spare_capacity_mut() + } } From 8300f67c6e6b4fd29e04dcea7d0c011ee4a6decc Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Tue, 16 Apr 2024 21:31:27 +0200 Subject: [PATCH 04/13] Turn bare links into automatic links --- library/core/src/slice/sort/stable/mod.rs | 4 ++-- library/core/src/slice/sort/unstable/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/library/core/src/slice/sort/stable/mod.rs b/library/core/src/slice/sort/stable/mod.rs index f9034e1b4d8ef..3d3fe2e70b7b4 100644 --- a/library/core/src/slice/sort/stable/mod.rs +++ b/library/core/src/slice/sort/stable/mod.rs @@ -14,10 +14,10 @@ pub(crate) mod quicksort; /// Stable sort called driftsort by Orson Peters and Lukas Bergdoll. /// Design document: -/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md +/// /// /// Upholds all safety properties outlined here: -/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md +/// #[inline(always)] pub fn sort bool, BufT: BufGuard>(v: &mut [T], is_less: &mut F) { // Arrays of zero-sized types are always all-equal, and thus sorted. 
diff --git a/library/core/src/slice/sort/unstable/mod.rs b/library/core/src/slice/sort/unstable/mod.rs index 0d0fbd74b388e..692c2d8f7c7ba 100644 --- a/library/core/src/slice/sort/unstable/mod.rs +++ b/library/core/src/slice/sort/unstable/mod.rs @@ -11,10 +11,10 @@ pub(crate) mod quicksort; /// Unstable sort called ipnsort by Lukas Bergdoll. /// Design document: -/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/ipnsort_introduction/text.md +/// /// /// Upholds all safety properties outlined here: -/// https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md +/// #[inline(always)] pub fn sort bool>(v: &mut [T], is_less: &mut F) { // Arrays of zero-sized types are always all-equal, and thus sorted. From 88fb5edb6525bd198208e0b459f70c9ddfb6e558 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Wed, 17 Apr 2024 17:50:05 +0200 Subject: [PATCH 05/13] Fix linkchecker doc errors Also includes small doc fixes. --- library/alloc/src/slice.rs | 6 +++--- library/core/src/slice/mod.rs | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs index 79540fcd3955f..846f8ba6c6e40 100644 --- a/library/alloc/src/slice.rs +++ b/library/alloc/src/slice.rs @@ -193,7 +193,7 @@ impl [T] { /// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which /// combines the fast average case of quicksort with the fast worst case and partial run /// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs - /// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*. + /// with k distinct elements, the expected time to sort the data is *O(*n* \* log(*k*))*. /// /// The auxiliary memory allocation behavior depends on the input length. Short slices are /// handled without allocation, medium sized slices allocate `self.len()` and beyond that it @@ -252,7 +252,7 @@ impl [T] { /// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which /// combines the fast average case of quicksort with the fast worst case and partial run /// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs - /// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*. + /// with k distinct elements, the expected time to sort the data is *O(*n* \* log(*k*))*. /// /// The auxiliary memory allocation behavior depends on the input length. Short slices are /// handled without allocation, medium sized slices allocate `self.len()` and beyond that it @@ -298,7 +298,7 @@ impl [T] { /// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which /// combines the fast average case of quicksort with the fast worst case and partial run /// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs - /// with k distinct elements, the expected time to sort the data is *O(*n* log(*k*))*. + /// with k distinct elements, the expected time to sort the data is *O(*n* \* log(*k*))*. /// /// The auxiliary memory allocation behavior depends on the input length. 
Short slices are /// handled without allocation, medium sized slices allocate `self.len()` and beyond that it diff --git a/library/core/src/slice/mod.rs b/library/core/src/slice/mod.rs index 4e6070cdb1b57..3c818eacca318 100644 --- a/library/core/src/slice/mod.rs +++ b/library/core/src/slice/mod.rs @@ -2893,7 +2893,7 @@ impl [T] { /// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which /// combines the fast average case of quicksort with the fast worst case of heapsort, achieving /// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the - /// expected time to sort the data is *O(*n* log(*k*))*. + /// expected time to sort the data is *O(*n* \* log(*k*))*. /// /// It is typically faster than stable sorting, except in a few special cases, e.g., when the /// slice is partially sorted. @@ -2950,7 +2950,7 @@ impl [T] { /// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which /// combines the fast average case of quicksort with the fast worst case of heapsort, achieving /// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the - /// expected time to sort the data is *O(*n* log(*k*))*. + /// expected time to sort the data is *O(*n* \* log(*k*))*. /// /// It is typically faster than stable sorting, except in a few special cases, e.g., when the /// slice is partially sorted. @@ -2994,7 +2994,7 @@ impl [T] { /// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which /// combines the fast average case of quicksort with the fast worst case of heapsort, achieving /// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the - /// expected time to sort the data is *O(*n* log(*k*))*. + /// expected time to sort the data is *O(*n* \* log(*k*))*. /// /// It is typically faster than stable sorting, except in a few special cases, e.g., when the /// slice is partially sorted. @@ -3042,8 +3042,8 @@ impl [T] { /// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime /// for all inputs. /// - /// It is typically faster than sorting, except in a few special cases, e.g., when the slice is - /// nearly fully sorted, where [`slice::sort`] may be faster. + /// It is typically faster than stable sorting, except in a few special cases, e.g., when the + /// slice is nearly fully sorted, where `slice::sort` may be faster. /// /// [`sort_unstable`]: slice::sort_unstable /// @@ -3103,8 +3103,8 @@ impl [T] { /// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime /// for all inputs. /// - /// It is typically faster than sorting, except in a few special cases, e.g., when the slice is - /// nearly fully sorted, where [`slice::sort`] may be faster. + /// It is typically faster than stable sorting, except in a few special cases, e.g., when the + /// slice is nearly fully sorted, where `slice::sort` may be faster. /// /// [`sort_unstable`]: slice::sort_unstable /// @@ -3168,8 +3168,8 @@ impl [T] { /// Median of Medians using Tukey's Ninther for pivot selection, which guarantees linear runtime /// for all inputs. /// - /// It is typically faster than sorting, except in a few special cases, e.g., when the slice is - /// nearly fully sorted, where [`slice::sort`] may be faster. + /// It is typically faster than stable sorting, except in a few special cases, e.g., when the + /// slice is nearly fully sorted, where `slice::sort` may be faster. 
/// /// [`sort_unstable`]: slice::sort_unstable /// From f7496f5f85fccadc4cb81704b78a5ffe3db16ea2 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sun, 16 Jun 2024 08:27:49 +0200 Subject: [PATCH 06/13] Apply review comments --- .../core/src/slice/sort/shared/smallsort.rs | 38 +++-- library/core/src/slice/sort/stable/merge.rs | 150 +++++++++--------- 2 files changed, 98 insertions(+), 90 deletions(-) diff --git a/library/core/src/slice/sort/shared/smallsort.rs b/library/core/src/slice/sort/shared/smallsort.rs index 6e4424310e80f..7cd351c9f2516 100644 --- a/library/core/src/slice/sort/shared/smallsort.rs +++ b/library/core/src/slice/sort/shared/smallsort.rs @@ -94,9 +94,9 @@ impl UnstableSmallSortTypeImpl for T { #[inline(always)] fn small_sort_threshold() -> usize { match const { choose_unstable_small_sort::() } { - UnstalbeSmallSort::Fallback => SMALL_SORT_FALLBACK_THRESHOLD, - UnstalbeSmallSort::General => SMALL_SORT_GENERAL_THRESHOLD, - UnstalbeSmallSort::Network => SMALL_SORT_NETWORK_THRESHOLD, + UnstableSmallSort::Fallback => SMALL_SORT_FALLBACK_THRESHOLD, + UnstableSmallSort::General => SMALL_SORT_GENERAL_THRESHOLD, + UnstableSmallSort::Network => SMALL_SORT_NETWORK_THRESHOLD, } } @@ -137,34 +137,34 @@ const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD; /// within this limit. const MAX_STACK_ARRAY_SIZE: usize = 4096; -enum UnstalbeSmallSort { +enum UnstableSmallSort { Fallback, General, Network, } -const fn choose_unstable_small_sort() -> UnstalbeSmallSort { +const fn choose_unstable_small_sort() -> UnstableSmallSort { if T::is_copy() && has_efficient_in_place_swap::() && (mem::size_of::() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE { // Heuristic for int like types. - return UnstalbeSmallSort::Network; + return UnstableSmallSort::Network; } if (mem::size_of::() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE { - return UnstalbeSmallSort::General; + return UnstableSmallSort::General; } - UnstalbeSmallSort::Fallback + UnstableSmallSort::Fallback } const fn inst_unstable_small_sort bool>() -> fn(&mut [T], &mut F) { match const { choose_unstable_small_sort::() } { - UnstalbeSmallSort::Fallback => small_sort_fallback::, - UnstalbeSmallSort::General => small_sort_general::, - UnstalbeSmallSort::Network => small_sort_network::, + UnstableSmallSort::Fallback => small_sort_fallback::, + UnstableSmallSort::General => small_sort_general::, + UnstableSmallSort::Network => small_sort_network::, } } @@ -384,8 +384,12 @@ where } } -// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no -// performance impact. +/// Sorts the first 9 elements of `v` with a fast fixed function. +/// +/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline +/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple +/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of +/// `small_sort_network` makes sure to only call this once. fn sort9_optimal(v: &mut [T], is_less: &mut F) where F: FnMut(&T, &T) -> bool, @@ -429,8 +433,12 @@ where } } -// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no -// performance impact. +/// Sorts the first 13 elements of `v` with a fast fixed function. +/// +/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline +/// `swap_if_less`. 
If the code of a sort impl changes so as to call this function in multiple +/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of +/// `small_sort_network` makes sure to only call this once. fn sort13_optimal(v: &mut [T], is_less: &mut F) where F: FnMut(&T, &T) -> bool, diff --git a/library/core/src/slice/sort/stable/merge.rs b/library/core/src/slice/sort/stable/merge.rs index 4f6a98bc7b89f..6739e114b130a 100644 --- a/library/core/src/slice/sort/stable/merge.rs +++ b/library/core/src/slice/sort/stable/merge.rs @@ -61,91 +61,91 @@ pub fn merge bool>( // Finally, `merge_state` gets dropped. If the shorter run was not fully // consumed, whatever remains of it will now be copied into the hole in `v`. } +} - // When dropped, copies the range `start..end` into `dst..`. - struct MergeState { - start: *mut T, - end: *mut T, - dst: *mut T, - } +// When dropped, copies the range `start..end` into `dst..`. +struct MergeState { + start: *mut T, + end: *mut T, + dst: *mut T, +} - impl MergeState { - /// # Safety - /// The caller MUST guarantee that `self` is initialized in a way where `start -> end` is - /// the longer sub-slice and so that `dst` can be written to at least the shorter sub-slice - /// length times. In addition `start -> end` and `right -> right_end` MUST be valid to be - /// read. This function MUST only be called once. - unsafe fn merge_up bool>( - &mut self, - mut right: *const T, - right_end: *const T, - is_less: &mut F, - ) { - // SAFETY: See function safety comment. - unsafe { - let left = &mut self.start; - let out = &mut self.dst; - - while *left != self.end && right as *const T != right_end { - let consume_left = !is_less(&*right, &**left); - - let src = if consume_left { *left } else { right }; - ptr::copy_nonoverlapping(src, *out, 1); - - *left = left.add(consume_left as usize); - right = right.add(!consume_left as usize); - - *out = out.add(1); - } +impl MergeState { + /// # Safety + /// The caller MUST guarantee that `self` is initialized in a way where `start -> end` is + /// the longer sub-slice and so that `dst` can be written to at least the shorter sub-slice + /// length times. In addition `start -> end` and `right -> right_end` MUST be valid to be + /// read. This function MUST only be called once. + unsafe fn merge_up bool>( + &mut self, + mut right: *const T, + right_end: *const T, + is_less: &mut F, + ) { + // SAFETY: See function safety comment. + unsafe { + let left = &mut self.start; + let out = &mut self.dst; + + while *left != self.end && right as *const T != right_end { + let consume_left = !is_less(&*right, &**left); + + let src = if consume_left { *left } else { right }; + ptr::copy_nonoverlapping(src, *out, 1); + + *left = left.add(consume_left as usize); + right = right.add(!consume_left as usize); + + *out = out.add(1); } } + } - /// # Safety - /// The caller MUST guarantee that `self` is initialized in a way where `left_end <- dst` is - /// the shorter sub-slice and so that `out` can be written to at least the shorter sub-slice - /// length times. In addition `left_end <- dst` and `right_end <- end` MUST be valid to be - /// read. This function MUST only be called once. - unsafe fn merge_down bool>( - &mut self, - left_end: *const T, - right_end: *const T, - mut out: *mut T, - is_less: &mut F, - ) { - // SAFETY: See function safety comment. 
- unsafe { - loop { - let left = self.dst.sub(1); - let right = self.end.sub(1); - out = out.sub(1); - - let consume_left = is_less(&*right, &*left); - - let src = if consume_left { left } else { right }; - ptr::copy_nonoverlapping(src, out, 1); - - self.dst = left.add(!consume_left as usize); - self.end = right.add(consume_left as usize); - - if self.dst as *const T == left_end || self.end as *const T == right_end { - break; - } + /// # Safety + /// The caller MUST guarantee that `self` is initialized in a way where `left_end <- dst` is + /// the shorter sub-slice and so that `out` can be written to at least the shorter sub-slice + /// length times. In addition `left_end <- dst` and `right_end <- end` MUST be valid to be + /// read. This function MUST only be called once. + unsafe fn merge_down bool>( + &mut self, + left_end: *const T, + right_end: *const T, + mut out: *mut T, + is_less: &mut F, + ) { + // SAFETY: See function safety comment. + unsafe { + loop { + let left = self.dst.sub(1); + let right = self.end.sub(1); + out = out.sub(1); + + let consume_left = is_less(&*right, &*left); + + let src = if consume_left { left } else { right }; + ptr::copy_nonoverlapping(src, out, 1); + + self.dst = left.add(!consume_left as usize); + self.end = right.add(consume_left as usize); + + if self.dst as *const T == left_end || self.end as *const T == right_end { + break; } } } } +} - impl Drop for MergeState { - fn drop(&mut self) { - // SAFETY: The user of MergeState MUST ensure, that at any point this drop - // impl MAY run, for example when the user provided `is_less` panics, that - // copying the contiguous region between `start` and `end` to `dst` will - // leave the input slice `v` with each original element and all possible - // modifications observed. - unsafe { - let len = self.end.sub_ptr(self.start); - ptr::copy_nonoverlapping(self.start, self.dst, len); - } +impl Drop for MergeState { + fn drop(&mut self) { + // SAFETY: The user of MergeState MUST ensure, that at any point this drop + // impl MAY run, for example when the user provided `is_less` panics, that + // copying the contiguous region between `start` and `end` to `dst` will + // leave the input slice `v` with each original element and all possible + // modifications observed. + unsafe { + let len = self.end.sub_ptr(self.start); + ptr::copy_nonoverlapping(self.start, self.dst, len); } } } From 9889ab1f6032d4ab38aa2aeffac002093e4f6559 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sun, 16 Jun 2024 17:17:36 +0200 Subject: [PATCH 07/13] Remove reliance on const_trait in sort implementations const_trait in conjunction with specialization was deemed not ready for usage in this scenario. So instead a two-stage trait specialization approach is used. This approach is likely worse for compile-times. Future work that enables const_trait can revert back to the previous version as outlined in the comment marked FIXME(effects). 
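For illustration only (this sketch is not part of the patch), the two-stage
specialization pattern has roughly the following shape. The trait name and the
concrete numbers are made up, and the real code gates the second stage behind
internal marker traits such as FreezeMarker/CopyMarker rather than a concrete
type:

    // Dispatch via `default fn` specialization instead of a const fn choice.
    // Requires a nightly compiler for `min_specialization`.
    #![feature(min_specialization)]

    trait SmallSortDispatch: Sized {
        fn small_sort_threshold() -> usize;
    }

    // Stage 1: blanket impl with a conservative default.
    impl<T> SmallSortDispatch for T {
        default fn small_sort_threshold() -> usize {
            16
        }
    }

    // Stage 2: a more specific impl overrides the default. The real code bounds
    // this on marker traits; a concrete type keeps the sketch minimal.
    impl SmallSortDispatch for u64 {
        fn small_sort_threshold() -> usize {
            32
        }
    }

    fn main() {
        assert_eq!(<String as SmallSortDispatch>::small_sort_threshold(), 16);
        assert_eq!(<u64 as SmallSortDispatch>::small_sort_threshold(), 32);
    }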
--- .../core/src/slice/sort/shared/smallsort.rs | 123 +++++++++--------- .../core/src/slice/sort/stable/quicksort.rs | 7 +- 2 files changed, 66 insertions(+), 64 deletions(-) diff --git a/library/core/src/slice/sort/shared/smallsort.rs b/library/core/src/slice/sort/shared/smallsort.rs index 7cd351c9f2516..9a0f7bf2f51d4 100644 --- a/library/core/src/slice/sort/shared/smallsort.rs +++ b/library/core/src/slice/sort/shared/smallsort.rs @@ -93,10 +93,62 @@ impl UnstableSmallSortTypeImpl for T { impl UnstableSmallSortTypeImpl for T { #[inline(always)] fn small_sort_threshold() -> usize { - match const { choose_unstable_small_sort::() } { - UnstableSmallSort::Fallback => SMALL_SORT_FALLBACK_THRESHOLD, - UnstableSmallSort::General => SMALL_SORT_GENERAL_THRESHOLD, - UnstableSmallSort::Network => SMALL_SORT_NETWORK_THRESHOLD, + ::small_sort_threshold() + } + + #[inline(always)] + fn small_sort(v: &mut [T], is_less: &mut F) + where + F: FnMut(&T, &T) -> bool, + { + ::small_sort(v, is_less); + } +} + +/// FIXME(effects) use original ipnsort approach with choose_unstable_small_sort, +/// as found here https://github.com/Voultapher/sort-research-rs/blob/438fad5d0495f65d4b72aa87f0b62fc96611dff3/ipnsort/src/smallsort.rs#L83C10-L83C36. +pub(crate) trait UnstableSmallSortFreezeTypeImpl: Sized + FreezeMarker { + fn small_sort_threshold() -> usize; + + fn small_sort bool>(v: &mut [Self], is_less: &mut F); +} + +impl UnstableSmallSortFreezeTypeImpl for T { + #[inline(always)] + default fn small_sort_threshold() -> usize { + if (mem::size_of::() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE { + SMALL_SORT_GENERAL_THRESHOLD + } else { + SMALL_SORT_FALLBACK_THRESHOLD + } + } + + #[inline(always)] + default fn small_sort(v: &mut [T], is_less: &mut F) + where + F: FnMut(&T, &T) -> bool, + { + if (mem::size_of::() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE { + small_sort_general(v, is_less); + } else { + small_sort_fallback(v, is_less); + } + } +} + +/// SAFETY: Only used for run-time optimization heuristic. +#[rustc_unsafe_specialization_marker] +trait CopyMarker {} + +impl UnstableSmallSortFreezeTypeImpl for T { + #[inline(always)] + fn small_sort_threshold() -> usize { + if has_efficient_in_place_swap::() + && (mem::size_of::() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE + { + SMALL_SORT_NETWORK_SCRATCH_LEN + } else { + SMALL_SORT_FALLBACK_THRESHOLD } } @@ -105,9 +157,13 @@ impl UnstableSmallSortTypeImpl for T { where F: FnMut(&T, &T) -> bool, { - // This construct is used to limit the LLVM IR generated, which saves large amounts of - // compile-time by only instantiating the code that is needed. Idea by Frank Steffahn. - (const { inst_unstable_small_sort::() })(v, is_less); + if has_efficient_in_place_swap::() + && (mem::size_of::() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE + { + small_sort_network(v, is_less); + } else { + small_sort_fallback(v, is_less); + } } } @@ -137,37 +193,6 @@ const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD; /// within this limit. const MAX_STACK_ARRAY_SIZE: usize = 4096; -enum UnstableSmallSort { - Fallback, - General, - Network, -} - -const fn choose_unstable_small_sort() -> UnstableSmallSort { - if T::is_copy() - && has_efficient_in_place_swap::() - && (mem::size_of::() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE - { - // Heuristic for int like types. 
- return UnstableSmallSort::Network; - } - - if (mem::size_of::() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE { - return UnstableSmallSort::General; - } - - UnstableSmallSort::Fallback -} - -const fn inst_unstable_small_sort bool>() --> fn(&mut [T], &mut F) { - match const { choose_unstable_small_sort::() } { - UnstableSmallSort::Fallback => small_sort_fallback::, - UnstableSmallSort::General => small_sort_general::, - UnstableSmallSort::Network => small_sort_network::, - } -} - fn small_sort_fallback bool>(v: &mut [T], is_less: &mut F) { if v.len() >= 2 { insertion_sort_shift_left(v, 1, is_less); @@ -822,25 +847,3 @@ pub(crate) const fn has_efficient_in_place_swap() -> bool { // Heuristic that holds true on all tested 64-bit capable architectures. mem::size_of::() <= 8 // mem::size_of::() } - -/// SAFETY: Only used for run-time optimization heuristic. -#[rustc_unsafe_specialization_marker] -trait CopyMarker {} - -impl CopyMarker for T {} - -#[const_trait] -trait IsCopy { - fn is_copy() -> bool; -} - -impl const IsCopy for T { - default fn is_copy() -> bool { - false - } -} -impl const IsCopy for T { - fn is_copy() -> bool { - true - } -} diff --git a/library/core/src/slice/sort/stable/quicksort.rs b/library/core/src/slice/sort/stable/quicksort.rs index e1734ce8d8b46..181fe603d2325 100644 --- a/library/core/src/slice/sort/stable/quicksort.rs +++ b/library/core/src/slice/sort/stable/quicksort.rs @@ -233,24 +233,23 @@ impl PartitionState { } } -#[const_trait] trait IsFreeze { fn is_freeze() -> bool; } -impl const IsFreeze for T { +impl IsFreeze for T { default fn is_freeze() -> bool { false } } -impl const IsFreeze for T { +impl IsFreeze for T { fn is_freeze() -> bool { true } } #[must_use] -const fn has_direct_interior_mutability() -> bool { +fn has_direct_interior_mutability() -> bool { // If a type has interior mutability it may alter itself during comparison // in a way that must be preserved after the sort operation concludes. // Otherwise a type like Mutex>> could lead to double free. From 24697ac66b1cb7e314cad5166872ae54d039856c Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Sun, 16 Jun 2024 20:26:33 +0200 Subject: [PATCH 08/13] Fix doc-link issue --- library/core/src/slice/sort/shared/smallsort.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/core/src/slice/sort/shared/smallsort.rs b/library/core/src/slice/sort/shared/smallsort.rs index 9a0f7bf2f51d4..e208f7143d441 100644 --- a/library/core/src/slice/sort/shared/smallsort.rs +++ b/library/core/src/slice/sort/shared/smallsort.rs @@ -106,7 +106,7 @@ impl UnstableSmallSortTypeImpl for T { } /// FIXME(effects) use original ipnsort approach with choose_unstable_small_sort, -/// as found here https://github.com/Voultapher/sort-research-rs/blob/438fad5d0495f65d4b72aa87f0b62fc96611dff3/ipnsort/src/smallsort.rs#L83C10-L83C36. +/// as found here . pub(crate) trait UnstableSmallSortFreezeTypeImpl: Sized + FreezeMarker { fn small_sort_threshold() -> usize; From 032ad4c4c6b6fe94947209a7f5cee595a3f225d1 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Mon, 17 Jun 2024 07:28:18 +0200 Subject: [PATCH 09/13] Fix unintended regression for Freeze + Copy types Freeze + Copy types should be allowed to choose between all three small-sort variants. With the recent changes to small-sort selection, a regression was added that only let such types choose between network and fallback. It can now also choose general where appropriate. 
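As a rough, standalone illustration (not part of the patch) of the intended
three-way choice for a Freeze + Copy type: MAX_STACK_ARRAY_SIZE = 4096 and the
size_of::<T>() <= 8 swap heuristic come from smallsort.rs, while the two
scratch-length values and the helper function itself are assumptions made for
this sketch; the real code performs these checks inside the specialized
UnstableSmallSortFreezeTypeImpl impls:

    use std::mem;

    const MAX_STACK_ARRAY_SIZE: usize = 4096;
    // Assumed example values for the sketch.
    const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = 32;
    const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = 48;

    // Which small-sort variant a Freeze + Copy type ends up with.
    fn chosen_small_sort<T>() -> &'static str {
        // Mirrors has_efficient_in_place_swap: cheap to swap if at most 8 bytes.
        let cheap_swap = mem::size_of::<T>() <= 8;

        if cheap_swap
            && (mem::size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
        {
            "network"
        } else if (mem::size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            "general"
        } else {
            "fallback"
        }
    }

    fn main() {
        // 8-byte int with cheap swaps -> network.
        assert_eq!(chosen_small_sort::<u64>(), "network");
        // 64-byte Copy type: swaps are no longer cheap, but the scratch buffer
        // still fits on the stack -> general (the case this patch restores).
        assert_eq!(chosen_small_sort::<[u64; 8]>(), "general");
        // 1 KiB elements: scratch would blow past MAX_STACK_ARRAY_SIZE -> fallback.
        assert_eq!(chosen_small_sort::<[u64; 128]>(), "fallback");
    }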
--- library/core/src/slice/sort/shared/smallsort.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/library/core/src/slice/sort/shared/smallsort.rs b/library/core/src/slice/sort/shared/smallsort.rs index e208f7143d441..567a7090ecd7d 100644 --- a/library/core/src/slice/sort/shared/smallsort.rs +++ b/library/core/src/slice/sort/shared/smallsort.rs @@ -146,7 +146,9 @@ impl UnstableSmallSortFreezeTypeImpl for T { if has_efficient_in_place_swap::() && (mem::size_of::() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE { - SMALL_SORT_NETWORK_SCRATCH_LEN + SMALL_SORT_NETWORK_THRESHOLD + } else if (mem::size_of::() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE { + SMALL_SORT_GENERAL_THRESHOLD } else { SMALL_SORT_FALLBACK_THRESHOLD } @@ -161,6 +163,8 @@ impl UnstableSmallSortFreezeTypeImpl for T { && (mem::size_of::() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE { small_sort_network(v, is_less); + } else if (mem::size_of::() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE { + small_sort_general(v, is_less); } else { small_sort_fallback(v, is_less); } From 732616998c06d8d9024db3e1f62e64ad0b50ed30 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Mon, 17 Jun 2024 22:05:35 +0200 Subject: [PATCH 10/13] Revert panic_safe test changes The changes made only a limited improvement for the current small miri coverage and in general test coverage of the sort implementations. But they exploded test times from ~13s to ~240s, which is not deemed worth it. --- library/alloc/src/slice/tests.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/library/alloc/src/slice/tests.rs b/library/alloc/src/slice/tests.rs index 63a176a5ace38..0156b9928dab4 100644 --- a/library/alloc/src/slice/tests.rs +++ b/library/alloc/src/slice/tests.rs @@ -197,7 +197,8 @@ fn panic_safe() { let mut rng = test_rng(); - let lens = if cfg!(miri) { (1..10).chain(30..36) } else { (1..20).chain(70..MAX_LEN) }; + // Miri is too slow (but still need to `chain` to make the types match) + let lens = if cfg!(miri) { (1..10).chain(0..0) } else { (1..20).chain(70..MAX_LEN) }; let moduli: &[u32] = if cfg!(miri) { &[5] } else { &[5, 20, 50] }; for len in lens { From b7deff3a0de56941b554e6a116f08c697e59ee51 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Mon, 17 Jun 2024 22:32:01 +0200 Subject: [PATCH 11/13] Add missing CopyMarker impl Due to refactoring the const_trait usage, the CopyMarker impl was accidentally deleted, which had the consequence that the Copy specialization for the small-sort was never picked. --- library/core/src/slice/sort/shared/smallsort.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/library/core/src/slice/sort/shared/smallsort.rs b/library/core/src/slice/sort/shared/smallsort.rs index 567a7090ecd7d..5111ed8756bf1 100644 --- a/library/core/src/slice/sort/shared/smallsort.rs +++ b/library/core/src/slice/sort/shared/smallsort.rs @@ -140,6 +140,8 @@ impl UnstableSmallSortFreezeTypeImpl for T { #[rustc_unsafe_specialization_marker] trait CopyMarker {} +impl CopyMarker for T {} + impl UnstableSmallSortFreezeTypeImpl for T { #[inline(always)] fn small_sort_threshold() -> usize { From 5caa7f9a4a2455e82da9bf1171937f024e9e026c Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Thu, 20 Jun 2024 18:03:42 +0200 Subject: [PATCH 12/13] Improve html-checker error message The previous message omits which of the dozens of tools called tidy is meant. 
And it's written in a way that one can easily miss the *not*, thinking it reads "Note that `tidy` is the in-tree `src/tools/tidy` but needs to be installed". The error message should hopefully help future contributors. --- src/bootstrap/src/core/build_steps/test.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bootstrap/src/core/build_steps/test.rs b/src/bootstrap/src/core/build_steps/test.rs index 360bd3840d456..8b65085e14e79 100644 --- a/src/bootstrap/src/core/build_steps/test.rs +++ b/src/bootstrap/src/core/build_steps/test.rs @@ -201,7 +201,7 @@ impl Step for HtmlCheck { if !check_if_tidy_is_installed() { eprintln!("not running HTML-check tool because `tidy` is missing"); eprintln!( - "Note that `tidy` is not the in-tree `src/tools/tidy` but needs to be installed" + "You need the HTML tidy tool https://www.html-tidy.org/, this tool is *not* part of the rust project and needs to be installed separately, for example via your package manager." ); panic!("Cannot run html-check tests"); } From a895ef77f062325f6b9cf7bf7e422a3008c3cd17 Mon Sep 17 00:00:00 2001 From: Lukas Bergdoll Date: Thu, 20 Jun 2024 18:07:04 +0200 Subject: [PATCH 13/13] Fix wrong big O star bracing in the doc comments --- library/alloc/src/slice.rs | 6 +++--- library/core/src/slice/mod.rs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs index 846f8ba6c6e40..c7960b3fb49c3 100644 --- a/library/alloc/src/slice.rs +++ b/library/alloc/src/slice.rs @@ -193,7 +193,7 @@ impl [T] { /// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which /// combines the fast average case of quicksort with the fast worst case and partial run /// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs - /// with k distinct elements, the expected time to sort the data is *O(*n* \* log(*k*))*. + /// with k distinct elements, the expected time to sort the data is *O*(*n* \* log(*k*)). /// /// The auxiliary memory allocation behavior depends on the input length. Short slices are /// handled without allocation, medium sized slices allocate `self.len()` and beyond that it @@ -252,7 +252,7 @@ impl [T] { /// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which /// combines the fast average case of quicksort with the fast worst case and partial run /// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs - /// with k distinct elements, the expected time to sort the data is *O(*n* \* log(*k*))*. + /// with k distinct elements, the expected time to sort the data is *O*(*n* \* log(*k*)). /// /// The auxiliary memory allocation behavior depends on the input length. Short slices are /// handled without allocation, medium sized slices allocate `self.len()` and beyond that it @@ -298,7 +298,7 @@ impl [T] { /// The current implementation is based on [driftsort] by Orson Peters and Lukas Bergdoll, which /// combines the fast average case of quicksort with the fast worst case and partial run /// detection of mergesort, achieving linear time on fully sorted and reversed inputs. On inputs - /// with k distinct elements, the expected time to sort the data is *O(*n* \* log(*k*))*. + /// with k distinct elements, the expected time to sort the data is *O*(*n* \* log(*k*)). /// /// The auxiliary memory allocation behavior depends on the input length. 
Short slices are /// handled without allocation, medium sized slices allocate `self.len()` and beyond that it diff --git a/library/core/src/slice/mod.rs b/library/core/src/slice/mod.rs index 3c818eacca318..d68dde823541a 100644 --- a/library/core/src/slice/mod.rs +++ b/library/core/src/slice/mod.rs @@ -2893,7 +2893,7 @@ impl [T] { /// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which /// combines the fast average case of quicksort with the fast worst case of heapsort, achieving /// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the - /// expected time to sort the data is *O(*n* \* log(*k*))*. + /// expected time to sort the data is *O*(*n* \* log(*k*)). /// /// It is typically faster than stable sorting, except in a few special cases, e.g., when the /// slice is partially sorted. @@ -2950,7 +2950,7 @@ impl [T] { /// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which /// combines the fast average case of quicksort with the fast worst case of heapsort, achieving /// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the - /// expected time to sort the data is *O(*n* \* log(*k*))*. + /// expected time to sort the data is *O*(*n* \* log(*k*)). /// /// It is typically faster than stable sorting, except in a few special cases, e.g., when the /// slice is partially sorted. @@ -2994,7 +2994,7 @@ impl [T] { /// The current implementation is based on [ipnsort] by Lukas Bergdoll and Orson Peters, which /// combines the fast average case of quicksort with the fast worst case of heapsort, achieving /// linear time on fully sorted and reversed inputs. On inputs with k distinct elements, the - /// expected time to sort the data is *O(*n* \* log(*k*))*. + /// expected time to sort the data is *O*(*n* \* log(*k*)). /// /// It is typically faster than stable sorting, except in a few special cases, e.g., when the /// slice is partially sorted.