Auto merge of #128204 - GuillaumeGomez:integers-opti, r=<try>

Small optimization for integers Display implementation This is a first pass to try to speed up a bit integers `Display` implementation. The idea behind this is to reduce the stack usage for the buffer storing the output (shouldn't be visible in bench normally) and some small specialization which benefits a lot to smaller integers like `u8` and `i8`. Here are the results of the benchmarks: | bench name | current std | with this PR | |-|-|-| | bench_std_fmt::bench_i16_0 | 16.45 ns/iter (+/- 0.25) | 16.50 ns/iter (+/- 0.15) | | bench_std_fmt::bench_i16_max | 17.83 ns/iter (+/- 0.66) | 17.58 ns/iter (+/- 0.10) | | bench_std_fmt::bench_i16_min | 20.97 ns/iter (+/- 0.49) | 20.50 ns/iter (+/- 0.28) | | bench_std_fmt::bench_i32_0 | 16.63 ns/iter (+/- 0.06) | 16.62 ns/iter (+/- 0.07) | | bench_std_fmt::bench_i32_max | 19.79 ns/iter (+/- 0.43) | 19.55 ns/iter (+/- 0.14) | | bench_std_fmt::bench_i32_min | 22.97 ns/iter (+/- 0.50) | 22.08 ns/iter (+/- 0.08) | | bench_std_fmt::bench_i64_0 | 16.63 ns/iter (+/- 0.39) | 16.69 ns/iter (+/- 0.44) | | bench_std_fmt::bench_i64_half | 19.60 ns/iter (+/- 0.05) | 19.10 ns/iter (+/- 0.05) | | bench_std_fmt::bench_i64_max | 25.22 ns/iter (+/- 0.34) | 24.43 ns/iter (+/- 0.02) | | bench_std_fmt::bench_i8_0 | 16.27 ns/iter (+/- 0.32) | 15.80 ns/iter (+/- 0.17) | | bench_std_fmt::bench_i8_max | 16.71 ns/iter (+/- 0.09) | 16.25 ns/iter (+/- 0.01) | | bench_std_fmt::bench_i8_min | 20.07 ns/iter (+/- 0.22) | 19.80 ns/iter (+/- 0.30) | | bench_std_fmt::bench_u128_0 | 21.37 ns/iter (+/- 0.24) | 21.35 ns/iter (+/- 0.35) | | bench_std_fmt::bench_u128_max | 48.13 ns/iter (+/- 0.20) | 48.78 ns/iter (+/- 0.29) | | bench_std_fmt::bench_u16_0 | 16.48 ns/iter (+/- 0.46) | 16.03 ns/iter (+/- 0.39) | | bench_std_fmt::bench_u16_max | 17.31 ns/iter (+/- 0.32) | 17.41 ns/iter (+/- 0.32) | | bench_std_fmt::bench_u16_min | 16.40 ns/iter (+/- 0.45) | 16.02 ns/iter (+/- 0.39) | | bench_std_fmt::bench_u32_0 | 16.17 ns/iter (+/- 0.04) | 16.29 ns/iter (+/- 0.16) | | bench_std_fmt::bench_u32_max | 19.00 ns/iter (+/- 0.10) | 19.16 ns/iter (+/- 0.28) | | bench_std_fmt::bench_u32_min | 16.16 ns/iter (+/- 0.09) | 16.28 ns/iter (+/- 0.11) | | bench_std_fmt::bench_u64_0 | 16.22 ns/iter (+/- 0.22) | 16.14 ns/iter (+/- 0.18) | | bench_std_fmt::bench_u64_half | 19.25 ns/iter (+/- 0.07) | 18.95 ns/iter (+/- 0.05) | | bench_std_fmt::bench_u64_max | 24.31 ns/iter (+/- 0.08) | 24.18 ns/iter (+/- 0.08) | | bench_std_fmt::bench_u8_0 | 15.76 ns/iter (+/- 0.08) | 15.66 ns/iter (+/- 0.08) | | bench_std_fmt::bench_u8_max | 16.53 ns/iter (+/- 0.03) | 16.29 ns/iter (+/- 0.02) | | bench_std_fmt::bench_u8_min | 15.77 ns/iter (+/- 0.06) | 15.67 ns/iter (+/- 0.02) | The source code is: <details> <summary>source code</summary> ```rust #![feature(test)] #![allow(non_snake_case)] #![allow(clippy::cast_lossless)] extern crate test; macro_rules! benches { ($($name:ident($value:expr))*) => { mod bench_std_fmt { use std::io::Write; use test::{Bencher, black_box}; $( #[bench] fn $name(b: &mut Bencher) { let mut buf = Vec::with_capacity(40); b.iter(|| { buf.clear(); write!(&mut buf, "{}", black_box($value)).unwrap(); black_box(&buf); }); } )* } } } benches! { bench_u64_0(0u64) bench_u64_half(u32::max_value() as u64) bench_u64_max(u64::max_value()) bench_i64_0(0i64) bench_i64_half(i32::max_value() as i64) bench_i64_max(i64::max_value()) bench_u16_0(0u16) bench_u16_min(u16::min_value()) bench_u16_max(u16::max_value()) bench_i16_0(0i16) bench_i16_min(i16::min_value()) bench_i16_max(i16::max_value()) bench_u128_0(0u128) bench_u128_max(u128::max_value()) bench_i8_0(0i8) bench_i8_min(i8::min_value()) bench_i8_max(i8::max_value()) bench_u8_0(0u8) bench_u8_min(u8::min_value()) bench_u8_max(u8::max_value()) bench_u32_0(0u32) bench_u32_min(u32::min_value()) bench_u32_max(u32::max_value()) bench_i32_0(0i32) bench_i32_min(i32::min_value()) bench_i32_max(i32::max_value()) } ``` </details> And then I ran the equivalent code (source code below) in callgrind with [callgrind_differ](https://github.com/Ethiraric/callgrind_differ) to generate a nice output and here's the result: ``` core::fmt::num::imp::<impl core::fmt::Display for i16>::fmt | 1300000 | - 70000 - 5.385% 1230000 core::fmt::num::imp::<impl core::fmt::Display for i32>::fmt | 1910000 | - 100000 - 5.236% 1810000 core::fmt::num::imp::<impl core::fmt::Display for i64>::fmt | 2430000 | - 110000 - 4.527% 2320000 core::fmt::num::imp::<impl core::fmt::Display for i8>::fmt | 1080000 | - 170000 - 15.741% 910000 core::fmt::num::imp::<impl core::fmt::Display for u16>::fmt | 960000 | + 10000 + 1.042% 970000 core::fmt::num::imp::<impl core::fmt::Display for u32>::fmt | 1300000 | + 30000 + 2.308% 1330000 core::fmt::num::imp::<impl core::fmt::Display for u8>::fmt | 820000 | - 30000 - 3.659% 790000 ``` <details> <summary>Source code</summary> ```rust #![feature(test)] extern crate test; use std::io::{stdout, Write}; use std::io::StdoutLock; use test::black_box; macro_rules! benches { ($handle:ident, $buf:ident, $($name:ident($value:expr))*) => { $( fn $name(handle: &mut StdoutLock, buf: &mut Vec<u8>) { for _ in 0..10000 { buf.clear(); write!(buf, "{}", black_box($value)).unwrap(); handle.write_all(buf); } } $name(&mut $handle, &mut $buf); )* } } fn main() { let mut handle = stdout().lock(); let mut buf = Vec::with_capacity(40); benches! { handle, buf, bench_u64_0(0u64) bench_u64_half(u32::max_value() as u64) bench_u64_max(u64::max_value()) bench_i64_0(0i64) bench_i64_half(i32::max_value() as i64) bench_i64_max(i64::max_value()) bench_u16_0(0u16) bench_u16_min(u16::min_value()) bench_u16_max(u16::max_value()) bench_i16_0(0i16) bench_i16_min(i16::min_value()) bench_i16_max(i16::max_value()) bench_u128_0(0u128) bench_u128_max(u128::max_value()) bench_i8_0(0i8) bench_i8_min(i8::min_value()) bench_i8_max(i8::max_value()) bench_u8_0(0u8) bench_u8_min(u8::min_value()) bench_u8_max(u8::max_value()) bench_i32_0(0i32) bench_i32_min(i32::min_value()) bench_i32_max(i32::max_value()) bench_u32_0(0u32) bench_u32_min(u32::min_value()) bench_u32_max(u32::max_value()) } } ``` </details> The next step would be to specialize the `ToString` implementation so it doesn't go through the `Display` trait. I'm not sure if it will improve anything but I think it's worth a try. r? `@Amanieu`
rust-lang · Sep 19, 2024 · 160eac8 · 160eac8
2 parents df7f778 + 7def3a7
commit 160eac8
Showing 1 changed file with 92 additions and 46 deletions.
diff --git a/library/core/src/fmt/num.rs b/library/core/src/fmt/num.rs
@@ -208,11 +208,46 @@ static DEC_DIGITS_LUT: &[u8; 200] = b"0001020304050607080910111213141516171819\
       8081828384858687888990919293949596979899";
 
 macro_rules! impl_Display {
-    ($($t:ident),* as $u:ident via $conv_fn:ident named $name:ident) => {
+    ($($t:ident => $size:literal $(as $positive:ident in $other:ident)? => named $name:ident,)* ; as $u:ident via $conv_fn:ident named $gen_name:ident) => {
+
+        $(
+        #[stable(feature = "rust1", since = "1.0.0")]
+        impl fmt::Display for $t {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                // If it's a signed integer.
+                $(
+                    let is_nonnegative = *self >= 0;
+
+                    #[cfg(not(feature = "optimize_for_size"))]
+                    {
+                        if !is_nonnegative {
+                            // convert the negative num to positive by summing 1 to its 2s complement
+                            return $other((!self as $positive).wrapping_add(1), false, f);
+                        }
+                    }
+                    #[cfg(feature = "optimize_for_size")]
+                    {
+                        if !is_nonnegative {
+                            // convert the negative num to positive by summing 1 to its 2s complement
+                            return $other((!self.$conv_fn()).wrapping_add(1), false, f);
+                        }
+                    }
+                )?
+                // If it's a positive integer.
+                #[cfg(not(feature = "optimize_for_size"))]
+                {
+                    $name(*self, true, f)
+                }
+                #[cfg(feature = "optimize_for_size")]
+                {
+                    $gen_name(*self, true, f)
+                }
+            }
+        }
+
         #[cfg(not(feature = "optimize_for_size"))]
-        fn $name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-            // 2^128 is about 3*10^38, so 39 gives an extra byte of space
-            let mut buf = [MaybeUninit::<u8>::uninit(); 39];
+        fn $name(mut n: $t, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            let mut buf = [MaybeUninit::<u8>::uninit(); $size];
             let mut curr = buf.len();
             let buf_ptr = MaybeUninit::slice_as_mut_ptr(&mut buf);
             let lut_ptr = DEC_DIGITS_LUT.as_ptr();
@@ -226,22 +261,26 @@ macro_rules! impl_Display {
             // is safe to access.
             unsafe {
                 // need at least 16 bits for the 4-characters-at-a-time to work.
-                assert!(crate::mem::size_of::<$u>() >= 2);
-
-                // eagerly decode 4 characters at a time
-                while n >= 10000 {
-                    let rem = (n % 10000) as usize;
-                    n /= 10000;
-
-                    let d1 = (rem / 100) << 1;
-                    let d2 = (rem % 100) << 1;
-                    curr -= 4;
-
-                    // We are allowed to copy to `buf_ptr[curr..curr + 3]` here since
-                    // otherwise `curr < 0`. But then `n` was originally at least `10000^10`
-                    // which is `10^40 > 2^128 > n`.
-                    ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2);
-                    ptr::copy_nonoverlapping(lut_ptr.add(d2), buf_ptr.add(curr + 2), 2);
+                #[allow(overflowing_literals)]
+                #[allow(unused_comparisons)]
+                // This block will be removed for smaller types at compile time and in the worst
+                // case, it will prevent to have the `10000` literal to overflow for `i8` and `u8`.
+                if core::mem::size_of::<$t>() >= 2 {
+                    // eagerly decode 4 characters at a time
+                    while n >= 10000 {
+                        let rem = (n % 10000) as usize;
+                        n /= 10000;
+
+                        let d1 = (rem / 100) << 1;
+                        let d2 = (rem % 100) << 1;
+                        curr -= 4;
+
+                        // We are allowed to copy to `buf_ptr[curr..curr + 3]` here since
+                        // otherwise `curr < 0`. But then `n` was originally at least `10000^10`
+                        // which is `10^40 > 2^128 > n`.
+                        ptr::copy_nonoverlapping(lut_ptr.add(d1 as usize), buf_ptr.add(curr), 2);
+                        ptr::copy_nonoverlapping(lut_ptr.add(d2 as usize), buf_ptr.add(curr + 2), 2);
+                    }
                 }
 
                 // if we reach here numbers are <= 9999, so at most 4 chars long
@@ -255,6 +294,8 @@ macro_rules! impl_Display {
                     ptr::copy_nonoverlapping(lut_ptr.add(d1), buf_ptr.add(curr), 2);
                 }
 
+                // if we reach here numbers are <= 100, so at most 2 chars long
+                // The biggest it can be is 99, and 99 << 1 == 198, so a `u8` is enough.
                 // decode last 1 or 2 chars
                 if n < 10 {
                     curr -= 1;
@@ -273,11 +314,10 @@ macro_rules! impl_Display {
                     slice::from_raw_parts(buf_ptr.add(curr), buf.len() - curr))
             };
             f.pad_integral(is_nonnegative, "", buf_slice)
-        }
+        })*
 
         #[cfg(feature = "optimize_for_size")]
-        fn $name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-            // 2^128 is about 3*10^38, so 39 gives an extra byte of space
+        fn $gen_name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
             let mut buf = [MaybeUninit::<u8>::uninit(); 39];
             let mut curr = buf.len();
             let buf_ptr = MaybeUninit::slice_as_mut_ptr(&mut buf);
@@ -306,21 +346,6 @@ macro_rules! impl_Display {
             };
             f.pad_integral(is_nonnegative, "", buf_slice)
         }
-
-        $(#[stable(feature = "rust1", since = "1.0.0")]
-        impl fmt::Display for $t {
-            #[allow(unused_comparisons)]
-            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-                let is_nonnegative = *self >= 0;
-                let n = if is_nonnegative {
-                    self.$conv_fn()
-                } else {
-                    // convert the negative num to positive by summing 1 to it's 2 complement
-                    (!self.$conv_fn()).wrapping_add(1)
-                };
-                $name(n, is_nonnegative, f)
-            }
-        })*
     };
 }
 
@@ -374,7 +399,6 @@ macro_rules! impl_Exp {
                 (n, exponent, exponent, added_precision)
             };
 
-            // 39 digits (worst case u128) + . = 40
             // Since `curr` always decreases by the number of digits copied, this means
             // that `curr >= 0`.
             let mut buf = [MaybeUninit::<u8>::uninit(); 40];
@@ -469,7 +493,7 @@ macro_rules! impl_Exp {
                     let n = if is_nonnegative {
                         self.$conv_fn()
                     } else {
-                        // convert the negative num to positive by summing 1 to it's 2 complement
+                        // convert the negative num to positive by summing 1 to its 2s complement
                         (!self.$conv_fn()).wrapping_add(1)
                     };
                     $name(n, is_nonnegative, false, f)
@@ -484,7 +508,7 @@ macro_rules! impl_Exp {
                     let n = if is_nonnegative {
                         self.$conv_fn()
                     } else {
-                        // convert the negative num to positive by summing 1 to it's 2 complement
+                        // convert the negative num to positive by summing 1 to its 2s complement
                         (!self.$conv_fn()).wrapping_add(1)
                     };
                     $name(n, is_nonnegative, true, f)
@@ -499,8 +523,17 @@ macro_rules! impl_Exp {
 mod imp {
     use super::*;
     impl_Display!(
-        i8, u8, i16, u16, i32, u32, i64, u64, usize, isize
-            as u64 via to_u64 named fmt_u64
+        i8 => 3 as u8 in fmt_u8 => named fmt_i8,
+        u8 => 3 => named fmt_u8,
+        i16 => 5 as u16 in fmt_u16 => named fmt_i16,
+        u16 => 5 => named fmt_u16,
+        i32 => 10 as u32 in fmt_u32 => named fmt_i32,
+        u32 => 10 => named fmt_u32,
+        i64 => 19 as u64 in fmt_u64 => named fmt_i64,
+        u64 => 20 => named fmt_u64,
+        isize => 19 as usize in fmt_usize => named fmt_isize,
+        usize => 20 => named fmt_usize,
+        ; as u64 via to_u64 named fmt_u64
     );
     impl_Exp!(
         i8, u8, i16, u16, i32, u32, i64, u64, usize, isize
@@ -511,8 +544,21 @@ mod imp {
 #[cfg(not(any(target_pointer_width = "64", target_arch = "wasm32")))]
 mod imp {
     use super::*;
-    impl_Display!(i8, u8, i16, u16, i32, u32, isize, usize as u32 via to_u32 named fmt_u32);
-    impl_Display!(i64, u64 as u64 via to_u64 named fmt_u64);
+    impl_Display!(
+        i8 => 3 as u8 in fmt_u8 => named fmt_i8,
+        u8 => 3 => named fmt_u8,
+        i16 => 5 as u16 in fmt_u16 => named fmt_i16,
+        u16 => 5 => named fmt_u16,
+        i32 => 10 as u32 in fmt_u32 => named fmt_i32,
+        u32 => 10 => named fmt_u32,
+        isize => 10 as usize in fmt_usize => named fmt_isize,
+        usize => 10 => named fmt_usize,
+        ; as u32 via to_u32 named fmt_u32);
+    impl_Display!(
+        i64 => 19 as u64 in fmt_u64 => named fmt_i64,
+        u64 => 20 => named fmt_u64,
+        ; as u64 via to_u64 named fmt_u64);
+
     impl_Exp!(i8, u8, i16, u16, i32, u32, isize, usize as u32 via to_u32 named exp_u32);
     impl_Exp!(i64, u64 as u64 via to_u64 named exp_u64);
 }
@@ -619,7 +665,7 @@ impl fmt::Display for i128 {
         let n = if is_nonnegative {
             self.to_u128()
         } else {
-            // convert the negative num to positive by summing 1 to it's 2 complement
+            // convert the negative num to positive by summing 1 to its 2s complement
             (!self.to_u128()).wrapping_add(1)
         };
         fmt_u128(n, is_nonnegative, f)