From ddaac494a62f7daa7a1166fe51b9b4180552e905 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Tue, 8 Feb 2022 18:21:54 -0800 Subject: [PATCH 01/15] Reflect the new paper - Change constants appearing in log & division computations - Rename beta_minus_1 to beta --- include/fmt/format-inl.h | 153 +++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 87 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index d0c03b43876c..9912dc1cb435 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -149,9 +149,6 @@ template <> FMT_FUNC int count_digits<4>(detail::fallback_uintptr n) { return i >= 0 ? i * char_digits + count_digits<4, unsigned>(n.value[i]) : 1; } -// log10(2) = 0x0.4d104d427de7fbcc... -static constexpr uint64_t log10_2_significand = 0x4d104d427de7fbcc; - template struct basic_impl_data { // Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340. // These are generated by support/compute-powers.py. @@ -895,86 +892,72 @@ inline uint64_t umul96_lower64(uint32_t x, uint64_t y) noexcept { // Computes floor(log10(pow(2, e))) for e in [-1700, 1700] using the method from // https://fmt.dev/papers/Grisu-Exact.pdf#page=5, section 3.4. inline int floor_log10_pow2(int e) noexcept { - FMT_ASSERT(e <= 1700 && e >= -1700, "too large exponent"); + FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent"); static_assert((-1 >> 1) == -1, "right shift is not arithmetic"); - const int shift = 22; - return (e * static_cast(log10_2_significand >> (64 - shift))) >> shift; + return (e * 315653) >> 20; } // Various fast log computations. inline int floor_log2_pow10(int e) noexcept { FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent"); - const uint64_t log2_10_integer_part = 3; - const uint64_t log2_10_fractional_digits = 0x5269e12f346e2bf9; - const int shift_amount = 19; - return (e * static_cast( - (log2_10_integer_part << shift_amount) | - (log2_10_fractional_digits >> (64 - shift_amount)))) >> - shift_amount; + return (e * 1741647) >> 19; } inline int floor_log10_pow2_minus_log10_4_over_3(int e) noexcept { - FMT_ASSERT(e <= 1700 && e >= -1700, "too large exponent"); - const uint64_t log10_4_over_3_fractional_digits = 0x1ffbfc2bbc780375; - const int shift_amount = 22; - return (e * static_cast(log10_2_significand >> (64 - shift_amount)) - - static_cast(log10_4_over_3_fractional_digits >> - (64 - shift_amount))) >> - shift_amount; + FMT_ASSERT(e <= 2936 && e >= -2985, "too large exponent"); + return (e * 631305 - 261663) >> 21; } +static constexpr struct { + int divisor; + int shift_amount; +} div_small_pow10_infos[] = {{10, 16}, {100, 16}}; + // Replaces n by floor(n / pow(10, N)) returning true if and only if n is // divisible by pow(10, N). // Precondition: n <= pow(10, N + 1). template bool check_divisibility_and_divide_by_pow10(uint32_t& n) noexcept { // The numbers below are chosen such that: - // 1. floor(n/d) = floor(nm / 2^(k+l)) where d=10 or d=100, - // 2. floor(nm/2^k) mod 2^l = 0 if and only if n is divisible by d, - // where m is magic_number, k is margin_bits, l is divisibility_check_bits + // 1. floor(n/d) = floor(nm / 2^k) where d=10 or d=100, + // 2. nm mod 2^k < m if and only if n is divisible by d, + // where m is magic_number, k is shift_amount // and d is divisor. // // Item 1 is a common technique of replacing division by a constant with // multiplication, see e.g. "Division by Invariant Integers Using // Multiplication" by Granlund and Montgomery (1994). magic_number (m) is set - // to ceil(2^(k+l)/d) for large enough k+l. + // to ceil(2^k/d) for large enough k. // The idea for item 2 originates from Schubfach. - static constexpr struct { - int divisor; - int margin_bits; - int divisibility_check_bits; - } infos[] = {{10, 8, 8}, {100, 10, 16}}; - constexpr auto info = infos[N - 1]; + constexpr auto info = div_small_pow10_infos[N - 1]; + FMT_ASSERT(n <= info.divisor * 10, "n is too large"); constexpr uint32_t magic_number = - (1 << (info.margin_bits + info.divisibility_check_bits)) / info.divisor + - 1; + (1u << info.shift_amount) / info.divisor + 1; n *= magic_number; - n >>= info.margin_bits; - const uint32_t comparison_mask = (1u << info.divisibility_check_bits) - 1; - bool result = (n & comparison_mask) == 0; - n >>= info.divisibility_check_bits; + const uint32_t comparison_mask = (1u << info.shift_amount) - 1; + bool result = (n & comparison_mask) < magic_number; + n >>= info.shift_amount; return result; } // Computes floor(n / pow(10, N)) for small n and N. // Precondition: n <= pow(10, N + 1). template uint32_t small_division_by_pow10(uint32_t n) noexcept { - static constexpr struct { - uint32_t magic_number; - int shift_amount; - uint32_t divisor_times_10; - } infos[] = {{0xcccd, 19, 100}, {0xa3d8, 22, 1000}}; - constexpr auto info = infos[N - 1]; - FMT_ASSERT(n <= info.divisor_times_10, "n is too large"); - return n * info.magic_number >> info.shift_amount; + constexpr auto info = div_small_pow10_infos[N - 1]; + FMT_ASSERT(n <= info.divisor * 10, "n is too large"); + constexpr uint32_t magic_number = + (1u << info.divisibility_check_bits) / info.divisor + 1; + return (n * magic_number) >> info.shift_amount; } // Computes floor(n / 10^(kappa + 1)) (float) inline uint32_t divide_by_10_to_kappa_plus_1(uint32_t n) noexcept { - return n / float_info::big_divisor; + // 1374389535 = ceil(2^37/100) + return (static_cast(n) * 1374389535) >> 37; } // Computes floor(n / 10^(kappa + 1)) (double) inline uint64_t divide_by_10_to_kappa_plus_1(uint64_t n) noexcept { - return umul128_upper64(n, 0x83126e978d4fdf3c) >> 9; + // 2361183241434822607 = ceil(2^(64+7)/1000) + return umul128_upper64(n, 2361183241434822607ull) >> 7; } // Various subroutines using pow10 cache @@ -1034,40 +1017,39 @@ template <> struct cache_accessor { } static uint32_t compute_delta(const cache_entry_type& cache, - int beta_minus_1) noexcept { - return static_cast(cache >> (64 - 1 - beta_minus_1)); + int beta) noexcept { + return static_cast(cache >> (64 - 1 - beta)); } static compute_mul_parity_result compute_mul_parity( - carrier_uint two_f, const cache_entry_type& cache, - int beta_minus_1) noexcept { - FMT_ASSERT(beta_minus_1 >= 1, ""); - FMT_ASSERT(beta_minus_1 < 64, ""); + carrier_uint two_f, const cache_entry_type& cache, int beta) noexcept { + FMT_ASSERT(beta >= 1, ""); + FMT_ASSERT(beta < 64, ""); auto r = umul96_lower64(two_f, cache); - return {((r >> (64 - beta_minus_1)) & 1) != 0, - static_cast(r >> (32 - beta_minus_1)) == 0}; + return {((r >> (64 - beta)) & 1) != 0, + static_cast(r >> (32 - beta)) == 0}; } static carrier_uint compute_left_endpoint_for_shorter_interval_case( - const cache_entry_type& cache, int beta_minus_1) noexcept { + const cache_entry_type& cache, int beta) noexcept { return static_cast( (cache - (cache >> (float_info::significand_bits + 2))) >> - (64 - float_info::significand_bits - 1 - beta_minus_1)); + (64 - float_info::significand_bits - 1 - beta)); } static carrier_uint compute_right_endpoint_for_shorter_interval_case( - const cache_entry_type& cache, int beta_minus_1) noexcept { + const cache_entry_type& cache, int beta) noexcept { return static_cast( (cache + (cache >> (float_info::significand_bits + 1))) >> - (64 - float_info::significand_bits - 1 - beta_minus_1)); + (64 - float_info::significand_bits - 1 - beta)); } static carrier_uint compute_round_up_for_shorter_interval_case( - const cache_entry_type& cache, int beta_minus_1) noexcept { + const cache_entry_type& cache, int beta) noexcept { return (static_cast( cache >> - (64 - float_info::significand_bits - 2 - beta_minus_1)) + + (64 - float_info::significand_bits - 2 - beta)) + 1) / 2; } @@ -1794,40 +1776,38 @@ template <> struct cache_accessor { } static uint32_t compute_delta(cache_entry_type const& cache, - int beta_minus_1) noexcept { - return static_cast(cache.high() >> (64 - 1 - beta_minus_1)); + int beta) noexcept { + return static_cast(cache.high() >> (64 - 1 - beta)); } static compute_mul_parity_result compute_mul_parity( - carrier_uint two_f, const cache_entry_type& cache, - int beta_minus_1) noexcept { - FMT_ASSERT(beta_minus_1 >= 1, ""); - FMT_ASSERT(beta_minus_1 < 64, ""); + carrier_uint two_f, const cache_entry_type& cache, int beta) noexcept { + FMT_ASSERT(beta >= 1, ""); + FMT_ASSERT(beta < 64, ""); auto r = umul192_lower128(two_f, cache); - return { - ((r.high() >> (64 - beta_minus_1)) & 1) != 0, - ((r.high() << beta_minus_1) | (r.low() >> (64 - beta_minus_1))) == 0}; + return {((r.high() >> (64 - beta)) & 1) != 0, + ((r.high() << beta) | (r.low() >> (64 - beta))) == 0}; } static carrier_uint compute_left_endpoint_for_shorter_interval_case( - const cache_entry_type& cache, int beta_minus_1) noexcept { + const cache_entry_type& cache, int beta) noexcept { return (cache.high() - (cache.high() >> (float_info::significand_bits + 2))) >> - (64 - float_info::significand_bits - 1 - beta_minus_1); + (64 - float_info::significand_bits - 1 - beta); } static carrier_uint compute_right_endpoint_for_shorter_interval_case( - const cache_entry_type& cache, int beta_minus_1) noexcept { + const cache_entry_type& cache, int beta) noexcept { return (cache.high() + (cache.high() >> (float_info::significand_bits + 1))) >> - (64 - float_info::significand_bits - 1 - beta_minus_1); + (64 - float_info::significand_bits - 1 - beta); } static carrier_uint compute_round_up_for_shorter_interval_case( - const cache_entry_type& cache, int beta_minus_1) noexcept { + const cache_entry_type& cache, int beta) noexcept { return ((cache.high() >> - (64 - float_info::significand_bits - 2 - beta_minus_1)) + + (64 - float_info::significand_bits - 2 - beta)) + 1) / 2; } @@ -1962,16 +1942,16 @@ FMT_INLINE decimal_fp shorter_interval_case(int exponent) noexcept { decimal_fp ret_value; // Compute k and beta const int minus_k = floor_log10_pow2_minus_log10_4_over_3(exponent); - const int beta_minus_1 = exponent + floor_log2_pow10(-minus_k); + const int beta = exponent + floor_log2_pow10(-minus_k); // Compute xi and zi using cache_entry_type = typename cache_accessor::cache_entry_type; const cache_entry_type cache = cache_accessor::get_cached_power(-minus_k); auto xi = cache_accessor::compute_left_endpoint_for_shorter_interval_case( - cache, beta_minus_1); + cache, beta); auto zi = cache_accessor::compute_right_endpoint_for_shorter_interval_case( - cache, beta_minus_1); + cache, beta); // If the left endpoint is not an integer, increase it if (!is_left_endpoint_integer_shorter_interval(exponent)) ++xi; @@ -1988,8 +1968,8 @@ FMT_INLINE decimal_fp shorter_interval_case(int exponent) noexcept { // Otherwise, compute the round-up of y ret_value.significand = - cache_accessor::compute_round_up_for_shorter_interval_case( - cache, beta_minus_1); + cache_accessor::compute_round_up_for_shorter_interval_case(cache, + beta); ret_value.exponent = minus_k; // When tie occurs, choose one of them according to the rule @@ -2040,11 +2020,11 @@ template decimal_fp to_decimal(T x) noexcept { // Compute k and beta. const int minus_k = floor_log10_pow2(exponent) - float_info::kappa; const cache_entry_type cache = cache_accessor::get_cached_power(-minus_k); - const int beta_minus_1 = exponent + floor_log2_pow10(-minus_k); + const int beta = exponent + floor_log2_pow10(-minus_k); // Compute zi and deltai. // 10^kappa <= deltai < 10^(kappa + 1) - const uint32_t deltai = cache_accessor::compute_delta(cache, beta_minus_1); + const uint32_t deltai = cache_accessor::compute_delta(cache, beta); const carrier_uint two_fc = significand << 1; // For the case of binary32, the result of integer check is not correct for @@ -2058,7 +2038,7 @@ template decimal_fp to_decimal(T x) noexcept { // Fortunately, with these inputs, that branch is never executed, so we are // fine. const typename cache_accessor::compute_mul_result z_mul = - cache_accessor::compute_mul((two_fc | 1) << beta_minus_1, cache); + cache_accessor::compute_mul((two_fc | 1) << beta, cache); // Step 2: Try larger divisor; remove trailing zeros if necessary. @@ -2090,13 +2070,12 @@ template decimal_fp to_decimal(T x) noexcept { // Otherwise, the inequalities on exponent ensure that // x is not an integer, so if z^(f) >= delta^(f) (even parity), we in fact // have strict inequality. - if (!cache_accessor::compute_mul_parity(two_fl, cache, beta_minus_1) - .parity) { + if (!cache_accessor::compute_mul_parity(two_fl, cache, beta).parity) { goto small_divisor_case_label; } } else { const typename cache_accessor::compute_mul_parity_result x_mul = - cache_accessor::compute_mul_parity(two_fl, cache, beta_minus_1); + cache_accessor::compute_mul_parity(two_fl, cache, beta); if (!x_mul.parity && !x_mul.is_integer) { goto small_divisor_case_label; } @@ -2133,7 +2112,7 @@ template decimal_fp to_decimal(T x) noexcept { // parity. Also, zi and r should have the same parity since the divisor // is an even number. const typename cache_accessor::compute_mul_parity_result y_mul = - cache_accessor::compute_mul_parity(two_fc, cache, beta_minus_1); + cache_accessor::compute_mul_parity(two_fc, cache, beta); if (y_mul.parity != approx_y_parity) { --ret_value.significand; From fc6ceeac2c547004d2f7eebd41966c03acd4941e Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Tue, 8 Feb 2022 18:23:53 -0800 Subject: [PATCH 02/15] Check r < deltai first, because that is the major branch chosen for short inputs --- include/fmt/format-inl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index 9912dc1cb435..eba27f469d38 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -2049,15 +2049,15 @@ template decimal_fp to_decimal(T x) noexcept { uint32_t r = static_cast(z_mul.result - float_info::big_divisor * ret_value.significand); - if (r > deltai) { - goto small_divisor_case_label; - } else if (r < deltai) { + if (r < deltai) { // Exclude the right endpoint if necessary. if (r == 0 && z_mul.is_integer && !include_right_endpoint) { --ret_value.significand; r = float_info::big_divisor; goto small_divisor_case_label; } + } else if (r > deltai) { + goto small_divisor_case_label; } else { // r == deltai; compare fractional parts. const carrier_uint two_fl = two_fc - 1; From 3b803dc03b15c0b06109225d92d705b433d99b83 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Tue, 8 Feb 2022 18:27:53 -0800 Subject: [PATCH 03/15] Add rotr --- include/fmt/format-inl.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index eba27f469d38..d2c68548ae53 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -816,6 +816,16 @@ struct uint128_wrapper { } }; +// Compilers should be able to optimize this into the ror instruction. +inline std::uint32_t rotr(uint32_t n, uint32_t r) noexcept { + r &= 31; + return (n >> r) | (n << (32 - r)); +} +inline std::uint64_t rotr(uint64_t n, uint32_t r) noexcept { + r &= 63; + return (n >> r) | (n << (64 - r)); +} + // Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox. namespace dragonbox { // Computes 128-bit result of multiplication of two 64-bit unsigned integers. From b2208fad6da154fde71c6a354c9edfcd5cd2f3f2 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Tue, 8 Feb 2022 18:32:20 -0800 Subject: [PATCH 04/15] Optimize remove_trailing_zeros --- include/fmt/format-inl.h | 153 ++++++++++++++++----------------------- 1 file changed, 62 insertions(+), 91 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index d2c68548ae53..733a3d9d9a7f 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -1835,115 +1835,86 @@ bool is_left_endpoint_integer_shorter_interval(int exponent) noexcept { // Remove trailing zeros from n and return the number of zeros removed (float) FMT_INLINE int remove_trailing_zeros(uint32_t& n) noexcept { -#ifdef FMT_BUILTIN_CTZ - int t = FMT_BUILTIN_CTZ(n); -#else - int t = ctz(n); -#endif - if (t > float_info::max_trailing_zeros) - t = float_info::max_trailing_zeros; - - const uint32_t mod_inv1 = 0xcccccccd; - const uint32_t max_quotient1 = 0x33333333; - const uint32_t mod_inv2 = 0xc28f5c29; - const uint32_t max_quotient2 = 0x0a3d70a3; + FMT_ASSERT(n != 0, ""); + const uint32_t mod_inv_5 = 0xcccccccd; + const uint32_t mod_inv_25 = mod_inv_5 * mod_inv_5; int s = 0; - for (; s < t - 1; s += 2) { - if (n * mod_inv2 > max_quotient2) break; - n *= mod_inv2; + while (true) { + auto q = rotr(n * mod_inv_25, 2); + if (q <= std::numeric_limits::max() / 100) { + n = q; + s += 2; + } else { + break; + } } - if (s < t && n * mod_inv1 <= max_quotient1) { - n *= mod_inv1; - ++s; + auto q = rotr(n * mod_inv_5, 1); + if (q <= std::numeric_limits::max() / 10) { + n = q; + s |= 1; } - n >>= s; + return s; } // Removes trailing zeros and returns the number of zeros removed (double) FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept { -#ifdef FMT_BUILTIN_CTZLL - int t = FMT_BUILTIN_CTZLL(n); -#else - int t = ctzll(n); -#endif - if (t > float_info::max_trailing_zeros) - t = float_info::max_trailing_zeros; - // Divide by 10^8 and reduce to 32-bits - // Since ret_value.significand <= (2^64 - 1) / 1000 < 10^17, - // both of the quotient and the r should fit in 32-bits - - const uint32_t mod_inv1 = 0xcccccccd; - const uint32_t max_quotient1 = 0x33333333; - const uint64_t mod_inv8 = 0xc767074b22e90e21; - const uint64_t max_quotient8 = 0x00002af31dc46118; - - // If the number is divisible by 1'0000'0000, work with the quotient - if (t >= 8) { - auto quotient_candidate = n * mod_inv8; - - if (quotient_candidate <= max_quotient8) { - auto quotient = static_cast(quotient_candidate >> 8); - - int s = 8; - for (; s < t; ++s) { - if (quotient * mod_inv1 > max_quotient1) break; - quotient *= mod_inv1; + FMT_ASSERT(n != 0, ""); + + // This magic number is ceil(2^90 / 10^8). + constexpr auto magic_number = uint64_t(12379400392853802749ull); + auto nm = umul128(n, magic_number); + + // Is n is divisible by 10^8? + if ((nm.high() & ((1ull << (90 - 64)) - 1)) == 0 && nm.low() < magic_number) { + // If yes, work with the quotient. + auto n32 = static_cast(nm.high() >> (90 - 64)); + + const uint32_t mod_inv_5 = 0xcccccccd; + const uint32_t mod_inv_25 = mod_inv_5 * mod_inv_5; + + int s = 8; + while (true) { + auto q = rotr(n32 * mod_inv_25, 2); + if (q <= std::numeric_limits::max() / 100) { + n32 = q; + s += 2; + } else { + break; } - quotient >>= (s - 8); - n = quotient; - return s; } - } - - // Otherwise, work with the remainder - auto quotient = static_cast(n / 100000000); - auto remainder = static_cast(n - 100000000 * quotient); - - if (t == 0 || remainder * mod_inv1 > max_quotient1) { - return 0; - } - remainder *= mod_inv1; - - if (t == 1 || remainder * mod_inv1 > max_quotient1) { - n = (remainder >> 1) + quotient * 10000000ull; - return 1; - } - remainder *= mod_inv1; - - if (t == 2 || remainder * mod_inv1 > max_quotient1) { - n = (remainder >> 2) + quotient * 1000000ull; - return 2; - } - remainder *= mod_inv1; + auto q = rotr(n32 * mod_inv_5, 1); + if (q <= std::numeric_limits::max() / 10) { + n32 = q; + s |= 1; + } - if (t == 3 || remainder * mod_inv1 > max_quotient1) { - n = (remainder >> 3) + quotient * 100000ull; - return 3; + n = n32; + return s; } - remainder *= mod_inv1; - if (t == 4 || remainder * mod_inv1 > max_quotient1) { - n = (remainder >> 4) + quotient * 10000ull; - return 4; - } - remainder *= mod_inv1; + // If n is not divisible by 10^8, work with n itself. + const uint64_t mod_inv_5 = 0xcccccccc'cccccccd; + const uint64_t mod_inv_25 = mod_inv_5 * mod_inv_5; - if (t == 5 || remainder * mod_inv1 > max_quotient1) { - n = (remainder >> 5) + quotient * 1000ull; - return 5; + int s = 0; + while (true) { + auto q = rotr(n * mod_inv_25, 2); + if (q <= std::numeric_limits::max() / 100) { + n = q; + s += 2; + } else { + break; + } } - remainder *= mod_inv1; - - if (t == 6 || remainder * mod_inv1 > max_quotient1) { - n = (remainder >> 6) + quotient * 100ull; - return 6; + auto q = rotr(n * mod_inv_5, 1); + if (q <= std::numeric_limits::max() / 10) { + n = q; + s |= 1; } - remainder *= mod_inv1; - n = (remainder >> 7) + quotient * 10ull; - return 7; + return s; } // The main algorithm for shorter interval case From cdb0d1894067cab8fdfcf07483fe688278914b01 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Tue, 8 Feb 2022 18:36:28 -0800 Subject: [PATCH 05/15] Recover log10_2_significand --- include/fmt/format-inl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index 733a3d9d9a7f..f35830d7ebd2 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -327,7 +327,8 @@ FMT_CONSTEXPR inline fp operator*(fp x, fp y) { FMT_CONSTEXPR inline fp get_cached_power(int min_exponent, int& pow10_exponent) { const int shift = 32; - const auto significand = static_cast(log10_2_significand); + // log10(2) = 0x0.4d104d427de7fbcc... + const auto significand = static_cast(0x4d104d427de7fbcc); int index = static_cast( ((min_exponent + fp::num_significand_bits - 1) * (significand >> shift) + ((int64_t(1) << shift) - 1)) // ceil From 2e5382835ceedddf86d8e6aac57fc7728306ff40 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Tue, 8 Feb 2022 19:57:25 -0800 Subject: [PATCH 06/15] Remove literal separator to satisfy some compilers --- include/fmt/format-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index f35830d7ebd2..f83ae66042f3 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -1896,7 +1896,7 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept { } // If n is not divisible by 10^8, work with n itself. - const uint64_t mod_inv_5 = 0xcccccccc'cccccccd; + const uint64_t mod_inv_5 = 0xcccccccccccccccd; const uint64_t mod_inv_25 = mod_inv_5 * mod_inv_5; int s = 0; From d758274a384dc317f89462006706934c2d31aa18 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Tue, 8 Feb 2022 19:59:35 -0800 Subject: [PATCH 07/15] Fix typo --- include/fmt/format-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index f83ae66042f3..21b2afaf9d5b 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -956,7 +956,7 @@ template uint32_t small_division_by_pow10(uint32_t n) noexcept { constexpr auto info = div_small_pow10_infos[N - 1]; FMT_ASSERT(n <= info.divisor * 10, "n is too large"); constexpr uint32_t magic_number = - (1u << info.divisibility_check_bits) / info.divisor + 1; + (1u << info.shift_amount) / info.divisor + 1; return (n * magic_number) >> info.shift_amount; } From a66266678ceb2cfc561d33276b507af7ae1f51b8 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Tue, 8 Feb 2022 23:26:20 -0800 Subject: [PATCH 08/15] Fix some conversion issues --- include/fmt/format-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index 21b2afaf9d5b..aa130f1e3095 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -328,7 +328,7 @@ FMT_CONSTEXPR inline fp get_cached_power(int min_exponent, int& pow10_exponent) { const int shift = 32; // log10(2) = 0x0.4d104d427de7fbcc... - const auto significand = static_cast(0x4d104d427de7fbcc); + const int64_t significand = 0x4d104d427de7fbcc; int index = static_cast( ((min_exponent + fp::num_significand_bits - 1) * (significand >> shift) + ((int64_t(1) << shift) - 1)) // ceil @@ -963,7 +963,7 @@ template uint32_t small_division_by_pow10(uint32_t n) noexcept { // Computes floor(n / 10^(kappa + 1)) (float) inline uint32_t divide_by_10_to_kappa_plus_1(uint32_t n) noexcept { // 1374389535 = ceil(2^37/100) - return (static_cast(n) * 1374389535) >> 37; + return static_cast((static_cast(n) * 1374389535) >> 37); } // Computes floor(n / 10^(kappa + 1)) (double) inline uint64_t divide_by_10_to_kappa_plus_1(uint64_t n) noexcept { From a71005a1c105ba189f647fc5ec10f79fffca5e37 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Sun, 13 Feb 2022 03:23:10 -0800 Subject: [PATCH 09/15] Remove std:: infront of uint32_t/64_t & add constexpr to rotr --- include/fmt/format-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index aa130f1e3095..43903660dbff 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -818,11 +818,11 @@ struct uint128_wrapper { }; // Compilers should be able to optimize this into the ror instruction. -inline std::uint32_t rotr(uint32_t n, uint32_t r) noexcept { +inline constexpr uint32_t rotr(uint32_t n, uint32_t r) noexcept { r &= 31; return (n >> r) | (n << (32 - r)); } -inline std::uint64_t rotr(uint64_t n, uint32_t r) noexcept { +inline constexpr uint64_t rotr(uint64_t n, uint32_t r) noexcept { r &= 63; return (n >> r) | (n << (64 - r)); } From fc907d296afd0104e4c7955e6d74c3c542116862 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Sun, 13 Feb 2022 03:29:36 -0800 Subject: [PATCH 10/15] Fix wrong comment/refer to a correct reference --- include/fmt/format-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index 43903660dbff..055f0c67e91e 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -900,8 +900,8 @@ inline uint64_t umul96_lower64(uint32_t x, uint64_t y) noexcept { return x * y; } -// Computes floor(log10(pow(2, e))) for e in [-1700, 1700] using the method from -// https://fmt.dev/papers/Grisu-Exact.pdf#page=5, section 3.4. +// Computes floor(log10(pow(2, e))) for e in [-2620, 2620] using the method from +// https://fmt.dev/papers/Dragonbox.pdf#page=28, section 6.1. inline int floor_log10_pow2(int e) noexcept { FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent"); static_assert((-1 >> 1) == -1, "right shift is not arithmetic"); From cba730d4b99b71f9148ac180f49669ad07edc8e1 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Sun, 13 Feb 2022 03:31:47 -0800 Subject: [PATCH 11/15] Simplify remove_trailing_zeros --- include/fmt/format-inl.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index 055f0c67e91e..75575b322f34 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -1843,12 +1843,9 @@ FMT_INLINE int remove_trailing_zeros(uint32_t& n) noexcept { int s = 0; while (true) { auto q = rotr(n * mod_inv_25, 2); - if (q <= std::numeric_limits::max() / 100) { - n = q; - s += 2; - } else { - break; - } + if (q > std::numeric_limits::max() / 100) break; + n = q; + s += 2; } auto q = rotr(n * mod_inv_5, 1); if (q <= std::numeric_limits::max() / 10) { From 56139c375a98f9b00faa38c67df723f3c63df114 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Sun, 13 Feb 2022 03:34:29 -0800 Subject: [PATCH 12/15] Remove some C-style casts for consistency --- include/fmt/format-inl.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index 75575b322f34..6a4b3eb6373e 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -839,7 +839,8 @@ inline uint128_wrapper umul128(uint64_t x, uint64_t y) noexcept { result.low_ = _umul128(x, y, &result.high_); return result; #else - const uint64_t mask = (uint64_t(1) << 32) - uint64_t(1); + const uint64_t mask = + static_cast(std::numeric_limits::max()); uint64_t a = x >> 32; uint64_t b = x & mask; @@ -882,7 +883,7 @@ inline uint128_wrapper umul192_upper128(uint64_t x, // Computes upper 64 bits of multiplication of a 32-bit unsigned integer and a // 64-bit unsigned integer. inline uint64_t umul96_upper64(uint32_t x, uint64_t y) noexcept { - return umul128_upper64(uint64_t(x) << 32, y); + return umul128_upper64(static_cast(x) << 32, y); } // Computes lower 128 bits of multiplication of a 64-bit unsigned integer and a @@ -1861,7 +1862,7 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept { FMT_ASSERT(n != 0, ""); // This magic number is ceil(2^90 / 10^8). - constexpr auto magic_number = uint64_t(12379400392853802749ull); + constexpr uint64_t magic_number = 12379400392853802749ull; auto nm = umul128(n, magic_number); // Is n is divisible by 10^8? From 96017bc0e6124ca0f9f0e84c0f49b15b17d00b49 Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Sun, 13 Feb 2022 03:36:58 -0800 Subject: [PATCH 13/15] Simplify remove_trailing_zeros --- include/fmt/format-inl.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index 6a4b3eb6373e..a9af399c8ab4 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -1876,12 +1876,9 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept { int s = 8; while (true) { auto q = rotr(n32 * mod_inv_25, 2); - if (q <= std::numeric_limits::max() / 100) { - n32 = q; - s += 2; - } else { - break; - } + if (q > std::numeric_limits::max() / 100) break; + n32 = q; + s += 2; } auto q = rotr(n32 * mod_inv_5, 1); if (q <= std::numeric_limits::max() / 10) { @@ -1900,12 +1897,9 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept { int s = 0; while (true) { auto q = rotr(n * mod_inv_25, 2); - if (q <= std::numeric_limits::max() / 100) { - n = q; - s += 2; - } else { - break; - } + if (q > std::numeric_limits::max() / 100) break; + n = q; + s += 2; } auto q = rotr(n * mod_inv_5, 1); if (q <= std::numeric_limits::max() / 10) { From b3b7b69dc4145ef314448a548d45281af99d376d Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Sun, 13 Feb 2022 03:42:01 -0800 Subject: [PATCH 14/15] Revert adding constexpr to rotr to satisfy C++11 compilers --- include/fmt/format-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index a9af399c8ab4..690ca261d080 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -818,11 +818,11 @@ struct uint128_wrapper { }; // Compilers should be able to optimize this into the ror instruction. -inline constexpr uint32_t rotr(uint32_t n, uint32_t r) noexcept { +inline uint32_t rotr(uint32_t n, uint32_t r) noexcept { r &= 31; return (n >> r) | (n << (32 - r)); } -inline constexpr uint64_t rotr(uint64_t n, uint32_t r) noexcept { +inline uint64_t rotr(uint64_t n, uint32_t r) noexcept { r &= 63; return (n >> r) | (n << (64 - r)); } From ba752b940703d7e7af79d7960ebf8344708db1ff Mon Sep 17 00:00:00 2001 From: Junekey Jeon Date: Sun, 13 Feb 2022 03:57:45 -0800 Subject: [PATCH 15/15] Add FMT_CONSTEXPR to rotr instead --- include/fmt/format-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index 690ca261d080..c4e03041a5e9 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -818,11 +818,11 @@ struct uint128_wrapper { }; // Compilers should be able to optimize this into the ror instruction. -inline uint32_t rotr(uint32_t n, uint32_t r) noexcept { +FMT_CONSTEXPR inline uint32_t rotr(uint32_t n, uint32_t r) noexcept { r &= 31; return (n >> r) | (n << (32 - r)); } -inline uint64_t rotr(uint64_t n, uint32_t r) noexcept { +FMT_CONSTEXPR inline uint64_t rotr(uint64_t n, uint32_t r) noexcept { r &= 63; return (n >> r) | (n << (64 - r)); }