From ddaac494a62f7daa7a1166fe51b9b4180552e905 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Tue, 8 Feb 2022 18:21:54 -0800
Subject: [PATCH 01/15] Reflect the new paper   - Change constants appearing in
 log & division computations   - Rename beta_minus_1 to beta

---
 include/fmt/format-inl.h | 153 +++++++++++++++++----------------------
 1 file changed, 66 insertions(+), 87 deletions(-)
diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index d0c03b43876c..9912dc1cb435 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -149,9 +149,6 @@ template <> FMT_FUNC int count_digits<4>(detail::fallback_uintptr n) {
   return i >= 0 ? i * char_digits + count_digits<4, unsigned>(n.value[i]) : 1;
 }
 
-// log10(2) = 0x0.4d104d427de7fbcc...
-static constexpr uint64_t log10_2_significand = 0x4d104d427de7fbcc;
-
 template <typename T = void> struct basic_impl_data {
   // Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340.
   // These are generated by support/compute-powers.py.
@@ -895,86 +892,72 @@ inline uint64_t umul96_lower64(uint32_t x, uint64_t y) noexcept {
 // Computes floor(log10(pow(2, e))) for e in [-1700, 1700] using the method from
 // https://fmt.dev/papers/Grisu-Exact.pdf#page=5, section 3.4.
 inline int floor_log10_pow2(int e) noexcept {
-  FMT_ASSERT(e <= 1700 && e >= -1700, "too large exponent");
+  FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent");
   static_assert((-1 >> 1) == -1, "right shift is not arithmetic");
-  const int shift = 22;
-  return (e * static_cast<int>(log10_2_significand >> (64 - shift))) >> shift;
+  return (e * 315653) >> 20;
 }
 
 // Various fast log computations.
 inline int floor_log2_pow10(int e) noexcept {
   FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent");
-  const uint64_t log2_10_integer_part = 3;
-  const uint64_t log2_10_fractional_digits = 0x5269e12f346e2bf9;
-  const int shift_amount = 19;
-  return (e * static_cast<int>(
-                  (log2_10_integer_part << shift_amount) |
-                  (log2_10_fractional_digits >> (64 - shift_amount)))) >>
-         shift_amount;
+  return (e * 1741647) >> 19;
 }
 inline int floor_log10_pow2_minus_log10_4_over_3(int e) noexcept {
-  FMT_ASSERT(e <= 1700 && e >= -1700, "too large exponent");
-  const uint64_t log10_4_over_3_fractional_digits = 0x1ffbfc2bbc780375;
-  const int shift_amount = 22;
-  return (e * static_cast<int>(log10_2_significand >> (64 - shift_amount)) -
-          static_cast<int>(log10_4_over_3_fractional_digits >>
-                           (64 - shift_amount))) >>
-         shift_amount;
+  FMT_ASSERT(e <= 2936 && e >= -2985, "too large exponent");
+  return (e * 631305 - 261663) >> 21;
 }
 
+static constexpr struct {
+  int divisor;
+  int shift_amount;
+} div_small_pow10_infos[] = {{10, 16}, {100, 16}};
+
 // Replaces n by floor(n / pow(10, N)) returning true if and only if n is
 // divisible by pow(10, N).
 // Precondition: n <= pow(10, N + 1).
 template <int N>
 bool check_divisibility_and_divide_by_pow10(uint32_t& n) noexcept {
   // The numbers below are chosen such that:
-  //   1. floor(n/d) = floor(nm / 2^(k+l)) where d=10 or d=100,
-  //   2. floor(nm/2^k) mod 2^l = 0 if and only if n is divisible by d,
-  // where m is magic_number, k is margin_bits, l is divisibility_check_bits
+  //   1. floor(n/d) = floor(nm / 2^k) where d=10 or d=100,
+  //   2. nm mod 2^k < m if and only if n is divisible by d,
+  // where m is magic_number, k is shift_amount
   // and d is divisor.
   //
   // Item 1 is a common technique of replacing division by a constant with
   // multiplication, see e.g. "Division by Invariant Integers Using
   // Multiplication" by Granlund and Montgomery (1994). magic_number (m) is set
-  // to ceil(2^(k+l)/d) for large enough k+l.
+  // to ceil(2^k/d) for large enough k.
   // The idea for item 2 originates from Schubfach.
-  static constexpr struct {
-    int divisor;
-    int margin_bits;
-    int divisibility_check_bits;
-  } infos[] = {{10, 8, 8}, {100, 10, 16}};
-  constexpr auto info = infos[N - 1];
+  constexpr auto info = div_small_pow10_infos[N - 1];
+  FMT_ASSERT(n <= info.divisor * 10, "n is too large");
   constexpr uint32_t magic_number =
-      (1 << (info.margin_bits + info.divisibility_check_bits)) / info.divisor +
-      1;
+      (1u << info.shift_amount) / info.divisor + 1;
   n *= magic_number;
-  n >>= info.margin_bits;
-  const uint32_t comparison_mask = (1u << info.divisibility_check_bits) - 1;
-  bool result = (n & comparison_mask) == 0;
-  n >>= info.divisibility_check_bits;
+  const uint32_t comparison_mask = (1u << info.shift_amount) - 1;
+  bool result = (n & comparison_mask) < magic_number;
+  n >>= info.shift_amount;
   return result;
 }
 
 // Computes floor(n / pow(10, N)) for small n and N.
 // Precondition: n <= pow(10, N + 1).
 template <int N> uint32_t small_division_by_pow10(uint32_t n) noexcept {
-  static constexpr struct {
-    uint32_t magic_number;
-    int shift_amount;
-    uint32_t divisor_times_10;
-  } infos[] = {{0xcccd, 19, 100}, {0xa3d8, 22, 1000}};
-  constexpr auto info = infos[N - 1];
-  FMT_ASSERT(n <= info.divisor_times_10, "n is too large");
-  return n * info.magic_number >> info.shift_amount;
+  constexpr auto info = div_small_pow10_infos[N - 1];
+  FMT_ASSERT(n <= info.divisor * 10, "n is too large");
+  constexpr uint32_t magic_number =
+      (1u << info.divisibility_check_bits) / info.divisor + 1;
+  return (n * magic_number) >> info.shift_amount;
 }
 
 // Computes floor(n / 10^(kappa + 1)) (float)
 inline uint32_t divide_by_10_to_kappa_plus_1(uint32_t n) noexcept {
-  return n / float_info<float>::big_divisor;
+  // 1374389535 = ceil(2^37/100)
+  return (static_cast<uint64_t>(n) * 1374389535) >> 37;
 }
 // Computes floor(n / 10^(kappa + 1)) (double)
 inline uint64_t divide_by_10_to_kappa_plus_1(uint64_t n) noexcept {
-  return umul128_upper64(n, 0x83126e978d4fdf3c) >> 9;
+  // 2361183241434822607 = ceil(2^(64+7)/1000)
+  return umul128_upper64(n, 2361183241434822607ull) >> 7;
 }
 
 // Various subroutines using pow10 cache
@@ -1034,40 +1017,39 @@ template <> struct cache_accessor<float> {
   }
 
   static uint32_t compute_delta(const cache_entry_type& cache,
-                                int beta_minus_1) noexcept {
-    return static_cast<uint32_t>(cache >> (64 - 1 - beta_minus_1));
+                                int beta) noexcept {
+    return static_cast<uint32_t>(cache >> (64 - 1 - beta));
   }
 
   static compute_mul_parity_result compute_mul_parity(
-      carrier_uint two_f, const cache_entry_type& cache,
-      int beta_minus_1) noexcept {
-    FMT_ASSERT(beta_minus_1 >= 1, "");
-    FMT_ASSERT(beta_minus_1 < 64, "");
+      carrier_uint two_f, const cache_entry_type& cache, int beta) noexcept {
+    FMT_ASSERT(beta >= 1, "");
+    FMT_ASSERT(beta < 64, "");
 
     auto r = umul96_lower64(two_f, cache);
-    return {((r >> (64 - beta_minus_1)) & 1) != 0,
-            static_cast<uint32_t>(r >> (32 - beta_minus_1)) == 0};
+    return {((r >> (64 - beta)) & 1) != 0,
+            static_cast<uint32_t>(r >> (32 - beta)) == 0};
   }
 
   static carrier_uint compute_left_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta_minus_1) noexcept {
+      const cache_entry_type& cache, int beta) noexcept {
     return static_cast<carrier_uint>(
         (cache - (cache >> (float_info<float>::significand_bits + 2))) >>
-        (64 - float_info<float>::significand_bits - 1 - beta_minus_1));
+        (64 - float_info<float>::significand_bits - 1 - beta));
   }
 
   static carrier_uint compute_right_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta_minus_1) noexcept {
+      const cache_entry_type& cache, int beta) noexcept {
     return static_cast<carrier_uint>(
         (cache + (cache >> (float_info<float>::significand_bits + 1))) >>
-        (64 - float_info<float>::significand_bits - 1 - beta_minus_1));
+        (64 - float_info<float>::significand_bits - 1 - beta));
   }
 
   static carrier_uint compute_round_up_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta_minus_1) noexcept {
+      const cache_entry_type& cache, int beta) noexcept {
     return (static_cast<carrier_uint>(
                 cache >>
-                (64 - float_info<float>::significand_bits - 2 - beta_minus_1)) +
+                (64 - float_info<float>::significand_bits - 2 - beta)) +
             1) /
            2;
   }
@@ -1794,40 +1776,38 @@ template <> struct cache_accessor<double> {
   }
 
   static uint32_t compute_delta(cache_entry_type const& cache,
-                                int beta_minus_1) noexcept {
-    return static_cast<uint32_t>(cache.high() >> (64 - 1 - beta_minus_1));
+                                int beta) noexcept {
+    return static_cast<uint32_t>(cache.high() >> (64 - 1 - beta));
   }
 
   static compute_mul_parity_result compute_mul_parity(
-      carrier_uint two_f, const cache_entry_type& cache,
-      int beta_minus_1) noexcept {
-    FMT_ASSERT(beta_minus_1 >= 1, "");
-    FMT_ASSERT(beta_minus_1 < 64, "");
+      carrier_uint two_f, const cache_entry_type& cache, int beta) noexcept {
+    FMT_ASSERT(beta >= 1, "");
+    FMT_ASSERT(beta < 64, "");
 
     auto r = umul192_lower128(two_f, cache);
-    return {
-        ((r.high() >> (64 - beta_minus_1)) & 1) != 0,
-        ((r.high() << beta_minus_1) | (r.low() >> (64 - beta_minus_1))) == 0};
+    return {((r.high() >> (64 - beta)) & 1) != 0,
+            ((r.high() << beta) | (r.low() >> (64 - beta))) == 0};
   }
 
   static carrier_uint compute_left_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta_minus_1) noexcept {
+      const cache_entry_type& cache, int beta) noexcept {
     return (cache.high() -
             (cache.high() >> (float_info<double>::significand_bits + 2))) >>
-           (64 - float_info<double>::significand_bits - 1 - beta_minus_1);
+           (64 - float_info<double>::significand_bits - 1 - beta);
   }
 
   static carrier_uint compute_right_endpoint_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta_minus_1) noexcept {
+      const cache_entry_type& cache, int beta) noexcept {
     return (cache.high() +
             (cache.high() >> (float_info<double>::significand_bits + 1))) >>
-           (64 - float_info<double>::significand_bits - 1 - beta_minus_1);
+           (64 - float_info<double>::significand_bits - 1 - beta);
   }
 
   static carrier_uint compute_round_up_for_shorter_interval_case(
-      const cache_entry_type& cache, int beta_minus_1) noexcept {
+      const cache_entry_type& cache, int beta) noexcept {
     return ((cache.high() >>
-             (64 - float_info<double>::significand_bits - 2 - beta_minus_1)) +
+             (64 - float_info<double>::significand_bits - 2 - beta)) +
             1) /
            2;
   }
@@ -1962,16 +1942,16 @@ FMT_INLINE decimal_fp<T> shorter_interval_case(int exponent) noexcept {
   decimal_fp<T> ret_value;
   // Compute k and beta
   const int minus_k = floor_log10_pow2_minus_log10_4_over_3(exponent);
-  const int beta_minus_1 = exponent + floor_log2_pow10(-minus_k);
+  const int beta = exponent + floor_log2_pow10(-minus_k);
 
   // Compute xi and zi
   using cache_entry_type = typename cache_accessor<T>::cache_entry_type;
   const cache_entry_type cache = cache_accessor<T>::get_cached_power(-minus_k);
 
   auto xi = cache_accessor<T>::compute_left_endpoint_for_shorter_interval_case(
-      cache, beta_minus_1);
+      cache, beta);
   auto zi = cache_accessor<T>::compute_right_endpoint_for_shorter_interval_case(
-      cache, beta_minus_1);
+      cache, beta);
 
   // If the left endpoint is not an integer, increase it
   if (!is_left_endpoint_integer_shorter_interval<T>(exponent)) ++xi;
@@ -1988,8 +1968,8 @@ FMT_INLINE decimal_fp<T> shorter_interval_case(int exponent) noexcept {
 
   // Otherwise, compute the round-up of y
   ret_value.significand =
-      cache_accessor<T>::compute_round_up_for_shorter_interval_case(
-          cache, beta_minus_1);
+      cache_accessor<T>::compute_round_up_for_shorter_interval_case(cache,
+                                                                    beta);
   ret_value.exponent = minus_k;
 
   // When tie occurs, choose one of them according to the rule
@@ -2040,11 +2020,11 @@ template <typename T> decimal_fp<T> to_decimal(T x) noexcept {
   // Compute k and beta.
   const int minus_k = floor_log10_pow2(exponent) - float_info<T>::kappa;
   const cache_entry_type cache = cache_accessor<T>::get_cached_power(-minus_k);
-  const int beta_minus_1 = exponent + floor_log2_pow10(-minus_k);
+  const int beta = exponent + floor_log2_pow10(-minus_k);
 
   // Compute zi and deltai.
   // 10^kappa <= deltai < 10^(kappa + 1)
-  const uint32_t deltai = cache_accessor<T>::compute_delta(cache, beta_minus_1);
+  const uint32_t deltai = cache_accessor<T>::compute_delta(cache, beta);
   const carrier_uint two_fc = significand << 1;
 
   // For the case of binary32, the result of integer check is not correct for
@@ -2058,7 +2038,7 @@ template <typename T> decimal_fp<T> to_decimal(T x) noexcept {
   // Fortunately, with these inputs, that branch is never executed, so we are
   // fine.
   const typename cache_accessor<T>::compute_mul_result z_mul =
-      cache_accessor<T>::compute_mul((two_fc | 1) << beta_minus_1, cache);
+      cache_accessor<T>::compute_mul((two_fc | 1) << beta, cache);
 
   // Step 2: Try larger divisor; remove trailing zeros if necessary.
 
@@ -2090,13 +2070,12 @@ template <typename T> decimal_fp<T> to_decimal(T x) noexcept {
       // Otherwise, the inequalities on exponent ensure that
       // x is not an integer, so if z^(f) >= delta^(f) (even parity), we in fact
       // have strict inequality.
-      if (!cache_accessor<T>::compute_mul_parity(two_fl, cache, beta_minus_1)
-               .parity) {
+      if (!cache_accessor<T>::compute_mul_parity(two_fl, cache, beta).parity) {
         goto small_divisor_case_label;
       }
     } else {
       const typename cache_accessor<T>::compute_mul_parity_result x_mul =
-          cache_accessor<T>::compute_mul_parity(two_fl, cache, beta_minus_1);
+          cache_accessor<T>::compute_mul_parity(two_fl, cache, beta);
       if (!x_mul.parity && !x_mul.is_integer) {
         goto small_divisor_case_label;
       }
@@ -2133,7 +2112,7 @@ template <typename T> decimal_fp<T> to_decimal(T x) noexcept {
     // parity. Also, zi and r should have the same parity since the divisor
     // is an even number.
     const typename cache_accessor<T>::compute_mul_parity_result y_mul =
-        cache_accessor<T>::compute_mul_parity(two_fc, cache, beta_minus_1);
+        cache_accessor<T>::compute_mul_parity(two_fc, cache, beta);
 
     if (y_mul.parity != approx_y_parity) {
       --ret_value.significand;

From fc6ceeac2c547004d2f7eebd41966c03acd4941e Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Tue, 8 Feb 2022 18:23:53 -0800
Subject: [PATCH 02/15] Check r < deltai first, because that is the major
 branch chosen for short inputs

---
 include/fmt/format-inl.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index 9912dc1cb435..eba27f469d38 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -2049,15 +2049,15 @@ template <typename T> decimal_fp<T> to_decimal(T x) noexcept {
   uint32_t r = static_cast<uint32_t>(z_mul.result - float_info<T>::big_divisor *
                                                         ret_value.significand);
 
-  if (r > deltai) {
-    goto small_divisor_case_label;
-  } else if (r < deltai) {
+  if (r < deltai) {
     // Exclude the right endpoint if necessary.
     if (r == 0 && z_mul.is_integer && !include_right_endpoint) {
       --ret_value.significand;
       r = float_info<T>::big_divisor;
       goto small_divisor_case_label;
     }
+  } else if (r > deltai) {
+    goto small_divisor_case_label;
   } else {
     // r == deltai; compare fractional parts.
     const carrier_uint two_fl = two_fc - 1;

From 3b803dc03b15c0b06109225d92d705b433d99b83 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Tue, 8 Feb 2022 18:27:53 -0800
Subject: [PATCH 03/15] Add rotr

---
 include/fmt/format-inl.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index eba27f469d38..d2c68548ae53 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -816,6 +816,16 @@ struct uint128_wrapper {
   }
 };
 
+// Compilers should be able to optimize this into the ror instruction.
+inline std::uint32_t rotr(uint32_t n, uint32_t r) noexcept {
+  r &= 31;
+  return (n >> r) | (n << (32 - r));
+}
+inline std::uint64_t rotr(uint64_t n, uint32_t r) noexcept {
+  r &= 63;
+  return (n >> r) | (n << (64 - r));
+}
+
 // Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox.
 namespace dragonbox {
 // Computes 128-bit result of multiplication of two 64-bit unsigned integers.

From b2208fad6da154fde71c6a354c9edfcd5cd2f3f2 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Tue, 8 Feb 2022 18:32:20 -0800
Subject: [PATCH 04/15] Optimize remove_trailing_zeros

---
 include/fmt/format-inl.h | 153 ++++++++++++++++-----------------------
 1 file changed, 62 insertions(+), 91 deletions(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index d2c68548ae53..733a3d9d9a7f 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -1835,115 +1835,86 @@ bool is_left_endpoint_integer_shorter_interval(int exponent) noexcept {
 
 // Remove trailing zeros from n and return the number of zeros removed (float)
 FMT_INLINE int remove_trailing_zeros(uint32_t& n) noexcept {
-#ifdef FMT_BUILTIN_CTZ
-  int t = FMT_BUILTIN_CTZ(n);
-#else
-  int t = ctz(n);
-#endif
-  if (t > float_info<float>::max_trailing_zeros)
-    t = float_info<float>::max_trailing_zeros;
-
-  const uint32_t mod_inv1 = 0xcccccccd;
-  const uint32_t max_quotient1 = 0x33333333;
-  const uint32_t mod_inv2 = 0xc28f5c29;
-  const uint32_t max_quotient2 = 0x0a3d70a3;
+  FMT_ASSERT(n != 0, "");
+  const uint32_t mod_inv_5 = 0xcccccccd;
+  const uint32_t mod_inv_25 = mod_inv_5 * mod_inv_5;
 
   int s = 0;
-  for (; s < t - 1; s += 2) {
-    if (n * mod_inv2 > max_quotient2) break;
-    n *= mod_inv2;
+  while (true) {
+    auto q = rotr(n * mod_inv_25, 2);
+    if (q <= std::numeric_limits<uint32_t>::max() / 100) {
+      n = q;
+      s += 2;
+    } else {
+      break;
+    }
   }
-  if (s < t && n * mod_inv1 <= max_quotient1) {
-    n *= mod_inv1;
-    ++s;
+  auto q = rotr(n * mod_inv_5, 1);
+  if (q <= std::numeric_limits<uint32_t>::max() / 10) {
+    n = q;
+    s |= 1;
   }
-  n >>= s;
+
   return s;
 }
 
 // Removes trailing zeros and returns the number of zeros removed (double)
 FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
-#ifdef FMT_BUILTIN_CTZLL
-  int t = FMT_BUILTIN_CTZLL(n);
-#else
-  int t = ctzll(n);
-#endif
-  if (t > float_info<double>::max_trailing_zeros)
-    t = float_info<double>::max_trailing_zeros;
-  // Divide by 10^8 and reduce to 32-bits
-  // Since ret_value.significand <= (2^64 - 1) / 1000 < 10^17,
-  // both of the quotient and the r should fit in 32-bits
-
-  const uint32_t mod_inv1 = 0xcccccccd;
-  const uint32_t max_quotient1 = 0x33333333;
-  const uint64_t mod_inv8 = 0xc767074b22e90e21;
-  const uint64_t max_quotient8 = 0x00002af31dc46118;
-
-  // If the number is divisible by 1'0000'0000, work with the quotient
-  if (t >= 8) {
-    auto quotient_candidate = n * mod_inv8;
-
-    if (quotient_candidate <= max_quotient8) {
-      auto quotient = static_cast<uint32_t>(quotient_candidate >> 8);
-
-      int s = 8;
-      for (; s < t; ++s) {
-        if (quotient * mod_inv1 > max_quotient1) break;
-        quotient *= mod_inv1;
+  FMT_ASSERT(n != 0, "");
+
+  // This magic number is ceil(2^90 / 10^8).
+  constexpr auto magic_number = uint64_t(12379400392853802749ull);
+  auto nm = umul128(n, magic_number);
+
+  // Is n is divisible by 10^8?
+  if ((nm.high() & ((1ull << (90 - 64)) - 1)) == 0 && nm.low() < magic_number) {
+    // If yes, work with the quotient.
+    auto n32 = static_cast<uint32_t>(nm.high() >> (90 - 64));
+
+    const uint32_t mod_inv_5 = 0xcccccccd;
+    const uint32_t mod_inv_25 = mod_inv_5 * mod_inv_5;
+
+    int s = 8;
+    while (true) {
+      auto q = rotr(n32 * mod_inv_25, 2);
+      if (q <= std::numeric_limits<uint32_t>::max() / 100) {
+        n32 = q;
+        s += 2;
+      } else {
+        break;
       }
-      quotient >>= (s - 8);
-      n = quotient;
-      return s;
     }
-  }
-
-  // Otherwise, work with the remainder
-  auto quotient = static_cast<uint32_t>(n / 100000000);
-  auto remainder = static_cast<uint32_t>(n - 100000000 * quotient);
-
-  if (t == 0 || remainder * mod_inv1 > max_quotient1) {
-    return 0;
-  }
-  remainder *= mod_inv1;
-
-  if (t == 1 || remainder * mod_inv1 > max_quotient1) {
-    n = (remainder >> 1) + quotient * 10000000ull;
-    return 1;
-  }
-  remainder *= mod_inv1;
-
-  if (t == 2 || remainder * mod_inv1 > max_quotient1) {
-    n = (remainder >> 2) + quotient * 1000000ull;
-    return 2;
-  }
-  remainder *= mod_inv1;
+    auto q = rotr(n32 * mod_inv_5, 1);
+    if (q <= std::numeric_limits<uint32_t>::max() / 10) {
+      n32 = q;
+      s |= 1;
+    }
 
-  if (t == 3 || remainder * mod_inv1 > max_quotient1) {
-    n = (remainder >> 3) + quotient * 100000ull;
-    return 3;
+    n = n32;
+    return s;
   }
-  remainder *= mod_inv1;
 
-  if (t == 4 || remainder * mod_inv1 > max_quotient1) {
-    n = (remainder >> 4) + quotient * 10000ull;
-    return 4;
-  }
-  remainder *= mod_inv1;
+  // If n is not divisible by 10^8, work with n itself.
+  const uint64_t mod_inv_5 = 0xcccccccc'cccccccd;
+  const uint64_t mod_inv_25 = mod_inv_5 * mod_inv_5;
 
-  if (t == 5 || remainder * mod_inv1 > max_quotient1) {
-    n = (remainder >> 5) + quotient * 1000ull;
-    return 5;
+  int s = 0;
+  while (true) {
+    auto q = rotr(n * mod_inv_25, 2);
+    if (q <= std::numeric_limits<uint64_t>::max() / 100) {
+      n = q;
+      s += 2;
+    } else {
+      break;
+    }
   }
-  remainder *= mod_inv1;
-
-  if (t == 6 || remainder * mod_inv1 > max_quotient1) {
-    n = (remainder >> 6) + quotient * 100ull;
-    return 6;
+  auto q = rotr(n * mod_inv_5, 1);
+  if (q <= std::numeric_limits<uint64_t>::max() / 10) {
+    n = q;
+    s |= 1;
   }
-  remainder *= mod_inv1;
 
-  n = (remainder >> 7) + quotient * 10ull;
-  return 7;
+  return s;
 }
 
 // The main algorithm for shorter interval case

From cdb0d1894067cab8fdfcf07483fe688278914b01 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Tue, 8 Feb 2022 18:36:28 -0800
Subject: [PATCH 05/15] Recover log10_2_significand

---
 include/fmt/format-inl.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index 733a3d9d9a7f..f35830d7ebd2 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -327,7 +327,8 @@ FMT_CONSTEXPR inline fp operator*(fp x, fp y) {
 FMT_CONSTEXPR inline fp get_cached_power(int min_exponent,
                                          int& pow10_exponent) {
   const int shift = 32;
-  const auto significand = static_cast<int64_t>(log10_2_significand);
+  // log10(2) = 0x0.4d104d427de7fbcc...
+  const auto significand = static_cast<int64_t>(0x4d104d427de7fbcc);
   int index = static_cast<int>(
       ((min_exponent + fp::num_significand_bits - 1) * (significand >> shift) +
        ((int64_t(1) << shift) - 1))  // ceil

From 2e5382835ceedddf86d8e6aac57fc7728306ff40 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Tue, 8 Feb 2022 19:57:25 -0800
Subject: [PATCH 06/15] Remove literal separator to satisfy some compilers

---
 include/fmt/format-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index f35830d7ebd2..f83ae66042f3 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -1896,7 +1896,7 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
   }
 
   // If n is not divisible by 10^8, work with n itself.
-  const uint64_t mod_inv_5 = 0xcccccccc'cccccccd;
+  const uint64_t mod_inv_5 = 0xcccccccccccccccd;
   const uint64_t mod_inv_25 = mod_inv_5 * mod_inv_5;
 
   int s = 0;

From d758274a384dc317f89462006706934c2d31aa18 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Tue, 8 Feb 2022 19:59:35 -0800
Subject: [PATCH 07/15] Fix typo

---
 include/fmt/format-inl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index f83ae66042f3..21b2afaf9d5b 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -956,7 +956,7 @@ template <int N> uint32_t small_division_by_pow10(uint32_t n) noexcept {
   constexpr auto info = div_small_pow10_infos[N - 1];
   FMT_ASSERT(n <= info.divisor * 10, "n is too large");
   constexpr uint32_t magic_number =
-      (1u << info.divisibility_check_bits) / info.divisor + 1;
+      (1u << info.shift_amount) / info.divisor + 1;
   return (n * magic_number) >> info.shift_amount;
 }
 

From a66266678ceb2cfc561d33276b507af7ae1f51b8 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Tue, 8 Feb 2022 23:26:20 -0800
Subject: [PATCH 08/15] Fix some conversion issues

---
 include/fmt/format-inl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index 21b2afaf9d5b..aa130f1e3095 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -328,7 +328,7 @@ FMT_CONSTEXPR inline fp get_cached_power(int min_exponent,
                                          int& pow10_exponent) {
   const int shift = 32;
   // log10(2) = 0x0.4d104d427de7fbcc...
-  const auto significand = static_cast<int64_t>(0x4d104d427de7fbcc);
+  const int64_t significand = 0x4d104d427de7fbcc;
   int index = static_cast<int>(
       ((min_exponent + fp::num_significand_bits - 1) * (significand >> shift) +
        ((int64_t(1) << shift) - 1))  // ceil
@@ -963,7 +963,7 @@ template <int N> uint32_t small_division_by_pow10(uint32_t n) noexcept {
 // Computes floor(n / 10^(kappa + 1)) (float)
 inline uint32_t divide_by_10_to_kappa_plus_1(uint32_t n) noexcept {
   // 1374389535 = ceil(2^37/100)
-  return (static_cast<uint64_t>(n) * 1374389535) >> 37;
+  return static_cast<uint32_t>((static_cast<uint64_t>(n) * 1374389535) >> 37);
 }
 // Computes floor(n / 10^(kappa + 1)) (double)
 inline uint64_t divide_by_10_to_kappa_plus_1(uint64_t n) noexcept {

From a71005a1c105ba189f647fc5ec10f79fffca5e37 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Sun, 13 Feb 2022 03:23:10 -0800
Subject: [PATCH 09/15] Remove std:: in front of uint32_t/64_t & add constexpr
 to rotr

---
 include/fmt/format-inl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index aa130f1e3095..43903660dbff 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -818,11 +818,11 @@ struct uint128_wrapper {
 };
 
 // Compilers should be able to optimize this into the ror instruction.
-inline std::uint32_t rotr(uint32_t n, uint32_t r) noexcept {
+inline constexpr uint32_t rotr(uint32_t n, uint32_t r) noexcept {
   r &= 31;
   return (n >> r) | (n << (32 - r));
 }
-inline std::uint64_t rotr(uint64_t n, uint32_t r) noexcept {
+inline constexpr uint64_t rotr(uint64_t n, uint32_t r) noexcept {
   r &= 63;
   return (n >> r) | (n << (64 - r));
 }

From fc907d296afd0104e4c7955e6d74c3c542116862 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Sun, 13 Feb 2022 03:29:36 -0800
Subject: [PATCH 10/15] Fix wrong comment/refer to a correct reference

---
 include/fmt/format-inl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index 43903660dbff..055f0c67e91e 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -900,8 +900,8 @@ inline uint64_t umul96_lower64(uint32_t x, uint64_t y) noexcept {
   return x * y;
 }
 
-// Computes floor(log10(pow(2, e))) for e in [-1700, 1700] using the method from
-// https://fmt.dev/papers/Grisu-Exact.pdf#page=5, section 3.4.
+// Computes floor(log10(pow(2, e))) for e in [-2620, 2620] using the method from
+// https://fmt.dev/papers/Dragonbox.pdf#page=28, section 6.1.
 inline int floor_log10_pow2(int e) noexcept {
   FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent");
   static_assert((-1 >> 1) == -1, "right shift is not arithmetic");

From cba730d4b99b71f9148ac180f49669ad07edc8e1 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Sun, 13 Feb 2022 03:31:47 -0800
Subject: [PATCH 11/15] Simplify remove_trailing_zeros

---
 include/fmt/format-inl.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index 055f0c67e91e..75575b322f34 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -1843,12 +1843,9 @@ FMT_INLINE int remove_trailing_zeros(uint32_t& n) noexcept {
   int s = 0;
   while (true) {
     auto q = rotr(n * mod_inv_25, 2);
-    if (q <= std::numeric_limits<uint32_t>::max() / 100) {
-      n = q;
-      s += 2;
-    } else {
-      break;
-    }
+    if (q > std::numeric_limits<uint32_t>::max() / 100) break;
+    n = q;
+    s += 2;
   }
   auto q = rotr(n * mod_inv_5, 1);
   if (q <= std::numeric_limits<uint32_t>::max() / 10) {

From 56139c375a98f9b00faa38c67df723f3c63df114 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Sun, 13 Feb 2022 03:34:29 -0800
Subject: [PATCH 12/15] Remove some C-style casts for consistency

---
 include/fmt/format-inl.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index 75575b322f34..6a4b3eb6373e 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -839,7 +839,8 @@ inline uint128_wrapper umul128(uint64_t x, uint64_t y) noexcept {
   result.low_ = _umul128(x, y, &result.high_);
   return result;
 #else
-  const uint64_t mask = (uint64_t(1) << 32) - uint64_t(1);
+  const uint64_t mask =
+      static_cast<uint64_t>(std::numeric_limits<uint32_t>::max());
 
   uint64_t a = x >> 32;
   uint64_t b = x & mask;
@@ -882,7 +883,7 @@ inline uint128_wrapper umul192_upper128(uint64_t x,
 // Computes upper 64 bits of multiplication of a 32-bit unsigned integer and a
 // 64-bit unsigned integer.
 inline uint64_t umul96_upper64(uint32_t x, uint64_t y) noexcept {
-  return umul128_upper64(uint64_t(x) << 32, y);
+  return umul128_upper64(static_cast<uint64_t>(x) << 32, y);
 }
 
 // Computes lower 128 bits of multiplication of a 64-bit unsigned integer and a
@@ -1861,7 +1862,7 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
   FMT_ASSERT(n != 0, "");
 
   // This magic number is ceil(2^90 / 10^8).
-  constexpr auto magic_number = uint64_t(12379400392853802749ull);
+  constexpr uint64_t magic_number = 12379400392853802749ull;
   auto nm = umul128(n, magic_number);
 
   // Is n is divisible by 10^8?

From 96017bc0e6124ca0f9f0e84c0f49b15b17d00b49 Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Sun, 13 Feb 2022 03:36:58 -0800
Subject: [PATCH 13/15] Simplify remove_trailing_zeros

---
 include/fmt/format-inl.h | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index 6a4b3eb6373e..a9af399c8ab4 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -1876,12 +1876,9 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
     int s = 8;
     while (true) {
       auto q = rotr(n32 * mod_inv_25, 2);
-      if (q <= std::numeric_limits<uint32_t>::max() / 100) {
-        n32 = q;
-        s += 2;
-      } else {
-        break;
-      }
+      if (q > std::numeric_limits<uint32_t>::max() / 100) break;
+      n32 = q;
+      s += 2;
     }
     auto q = rotr(n32 * mod_inv_5, 1);
     if (q <= std::numeric_limits<uint32_t>::max() / 10) {
@@ -1900,12 +1897,9 @@ FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
   int s = 0;
   while (true) {
     auto q = rotr(n * mod_inv_25, 2);
-    if (q <= std::numeric_limits<uint64_t>::max() / 100) {
-      n = q;
-      s += 2;
-    } else {
-      break;
-    }
+    if (q > std::numeric_limits<uint64_t>::max() / 100) break;
+    n = q;
+    s += 2;
   }
   auto q = rotr(n * mod_inv_5, 1);
   if (q <= std::numeric_limits<uint64_t>::max() / 10) {

From b3b7b69dc4145ef314448a548d45281af99d376d Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Sun, 13 Feb 2022 03:42:01 -0800
Subject: [PATCH 14/15] Revert adding constexpr to rotr to satisfy C++11
 compilers

---
 include/fmt/format-inl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index a9af399c8ab4..690ca261d080 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -818,11 +818,11 @@ struct uint128_wrapper {
 };
 
 // Compilers should be able to optimize this into the ror instruction.
-inline constexpr uint32_t rotr(uint32_t n, uint32_t r) noexcept {
+inline uint32_t rotr(uint32_t n, uint32_t r) noexcept {
   r &= 31;
   return (n >> r) | (n << (32 - r));
 }
-inline constexpr uint64_t rotr(uint64_t n, uint32_t r) noexcept {
+inline uint64_t rotr(uint64_t n, uint32_t r) noexcept {
   r &= 63;
   return (n >> r) | (n << (64 - r));
 }

From ba752b940703d7e7af79d7960ebf8344708db1ff Mon Sep 17 00:00:00 2001
From: Junekey Jeon <jk_jeon@kaist.ac.kr>
Date: Sun, 13 Feb 2022 03:57:45 -0800
Subject: [PATCH 15/15] Add FMT_CONSTEXPR to rotr instead

---
 include/fmt/format-inl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index 690ca261d080..c4e03041a5e9 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -818,11 +818,11 @@ struct uint128_wrapper {
 };
 
 // Compilers should be able to optimize this into the ror instruction.
-inline uint32_t rotr(uint32_t n, uint32_t r) noexcept {
+FMT_CONSTEXPR inline uint32_t rotr(uint32_t n, uint32_t r) noexcept {
   r &= 31;
   return (n >> r) | (n << (32 - r));
 }
-inline uint64_t rotr(uint64_t n, uint32_t r) noexcept {
+FMT_CONSTEXPR inline uint64_t rotr(uint64_t n, uint32_t r) noexcept {
   r &= 63;
   return (n >> r) | (n << (64 - r));
 }