From 6380743fa05cccf5ea1f4b49a1eb23c736752e9d Mon Sep 17 00:00:00 2001
From: Jack Lloyd <jack@randombit.net>
Date: Sat, 22 Jun 2024 16:33:59 -0400
Subject: [PATCH 1/2] Add specialized reduction for P256 in pcurves

For 32-bit x86, this reduction results in point arithmetic operations
that are 25-35% faster than when using Montgomery.

Sadly for 64-bit x86 it is at best about even with using Montgomery,
and for Clang 64-bit it's even somewhat slower.
---
 src/lib/math/pcurves/info.txt                 |   1 +
 .../pcurves_secp256r1/pcurves_secp256r1.cpp   | 114 +++++++++++++++++-
 src/lib/math/pcurves/pcurves_solinas.h        |  83 +++++++++++++
 3 files changed, 194 insertions(+), 4 deletions(-)
 create mode 100644 src/lib/math/pcurves/pcurves_solinas.h
diff --git a/src/lib/math/pcurves/info.txt b/src/lib/math/pcurves/info.txt
index d07bb208f3f..730b60f62bb 100644
--- a/src/lib/math/pcurves/info.txt
+++ b/src/lib/math/pcurves/info.txt
@@ -19,6 +19,7 @@ pcurves.h
 pcurves_id.h
 pcurves_impl.h
 pcurves_util.h
+pcurves_solinas.h
 pcurves_wrap.h
 pcurves_instance.h
 </header:internal>
diff --git a/src/lib/math/pcurves/pcurves_secp256r1/pcurves_secp256r1.cpp b/src/lib/math/pcurves/pcurves_secp256r1/pcurves_secp256r1.cpp
index d7aee4cb44d..80303a3204f 100644
--- a/src/lib/math/pcurves/pcurves_secp256r1/pcurves_secp256r1.cpp
+++ b/src/lib/math/pcurves/pcurves_secp256r1/pcurves_secp256r1.cpp
@@ -6,15 +6,115 @@
 
 #include <botan/internal/pcurves_instance.h>
 
+#include <botan/internal/pcurves_solinas.h>
 #include <botan/internal/pcurves_wrap.h>
 
 namespace Botan::PCurve {
 
 namespace {
 
-// clang-format off
+template <typename Params>
+class Secp256r1Rep final {
+   public:
+      static constexpr auto P = Params::P;
+      static constexpr size_t N = Params::N;
+      typedef typename Params::W W;
+
+      // Adds 4 * P-256 to prevent underflow
+      static constexpr auto P256_4 =
+         hex_to_words<uint32_t>("0x3fffffffc00000004000000000000000000000003fffffffffffffffffffffffc");
+
+      constexpr static std::array<W, N> redc(const std::array<W, 2 * N>& z) {
+         const int64_t X00 = get_uint32(z.data(), 0);
+         const int64_t X01 = get_uint32(z.data(), 1);
+         const int64_t X02 = get_uint32(z.data(), 2);
+         const int64_t X03 = get_uint32(z.data(), 3);
+         const int64_t X04 = get_uint32(z.data(), 4);
+         const int64_t X05 = get_uint32(z.data(), 5);
+         const int64_t X06 = get_uint32(z.data(), 6);
+         const int64_t X07 = get_uint32(z.data(), 7);
+         const int64_t X08 = get_uint32(z.data(), 8);
+         const int64_t X09 = get_uint32(z.data(), 9);
+         const int64_t X10 = get_uint32(z.data(), 10);
+         const int64_t X11 = get_uint32(z.data(), 11);
+         const int64_t X12 = get_uint32(z.data(), 12);
+         const int64_t X13 = get_uint32(z.data(), 13);
+         const int64_t X14 = get_uint32(z.data(), 14);
+         const int64_t X15 = get_uint32(z.data(), 15);
+
+         // See SP 800-186 section G.1.2
+         const int64_t S0 = P256_4[0] + X00 + X08 + X09 - (X11 + X12 + X13 + X14);
+         const int64_t S1 = P256_4[1] + X01 + X09 + X10 - (X12 + X13 + X14 + X15);
+         const int64_t S2 = P256_4[2] + X02 + X10 + X11 - (X13 + X14 + X15);
+         const int64_t S3 = P256_4[3] + X03 + 2 * (X11 + X12) + X13 - (X15 + X08 + X09);
+         const int64_t S4 = P256_4[4] + X04 + 2 * (X12 + X13) + X14 - (X09 + X10);
+         const int64_t S5 = P256_4[5] + X05 + 2 * (X13 + X14) + X15 - (X10 + X11);
+         const int64_t S6 = P256_4[6] + X06 + X13 + X14 * 3 + X15 * 2 - (X08 + X09);
+         const int64_t S7 = P256_4[7] + X07 + X15 * 3 + X08 - (X10 + X11 + X12 + X13);
+         const int64_t S8 = P256_4[8];
+
+         std::array<W, N> r = {};
+
+         SolinasAccum sum(r);
+
+         sum.accum(S0);
+         sum.accum(S1);
+         sum.accum(S2);
+         sum.accum(S3);
+         sum.accum(S4);
+         sum.accum(S5);
+         sum.accum(S6);
+         sum.accum(S7);
+         const auto S = sum.final_carry(S8);
+
+         BOTAN_DEBUG_ASSERT(S <= 8);
+
+         const auto correction = p256_mul_mod_256(S);
+         W borrow = bigint_sub2(r.data(), N, correction.data(), N);
+
+         bigint_cnd_add(borrow, r.data(), N, P.data(), N);
+
+         return r;
+      }
+
+      constexpr static std::array<W, N> one() { return std::array<W, N>{1}; }
+
+      constexpr static std::array<W, N> to_rep(const std::array<W, N>& x) { return x; }
+
+      constexpr static std::array<W, N> wide_to_rep(const std::array<W, 2 * N>& x) { return redc(x); }
+
+      constexpr static std::array<W, N> from_rep(const std::array<W, N>& z) { return z; }
+
+   private:
+      // Return (i*P-256) % 2**256
+      //
+      // Assumes i is small
+      constexpr static std::array<W, N> p256_mul_mod_256(W i) {
+         static_assert(WordInfo<W>::bits == 32 || WordInfo<W>::bits == 64);
+
+         // For small i, multiples of P-256 have a simple structure so it's faster to
+         // compute the value directly vs a (constant time) table lookup
+
+         auto r = P;
+         if constexpr(WordInfo<W>::bits == 32) {
+            r[7] -= i;
+            r[6] += i;
+            r[3] += i;
+            r[0] -= i;
+         } else {
+            const uint64_t i32 = static_cast<uint64_t>(i) << 32;
+            r[3] -= i32;
+            r[3] += i;
+            r[1] += i32;
+            r[0] -= i;
+         }
+         return r;
+      }
+};
+
 namespace secp256r1 {
 
+// clang-format off
 class Params final : public EllipticCurveParameters<
    "FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFF",
    "FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFC",
@@ -25,11 +125,17 @@ class Params final : public EllipticCurveParameters<
    -10> {
 };
 
-class Curve final : public EllipticCurve<Params> {};
+// clang-format on
 
-}
+#if BOTAN_MP_WORD_BITS == 32
+// Secp256r1Rep works for 64 bit also, but is at best marginally faster at least
+// on compilers/CPUs tested so far
+class Curve final : public EllipticCurve<Params, Secp256r1Rep> {};
+#else
+class Curve final : public EllipticCurve<Params> {};
+#endif
 
-// clang-format on
+}  // namespace secp256r1
 
 }  // namespace
 
diff --git a/src/lib/math/pcurves/pcurves_solinas.h b/src/lib/math/pcurves/pcurves_solinas.h
new file mode 100644
index 00000000000..0f3e9af4a27
--- /dev/null
+++ b/src/lib/math/pcurves/pcurves_solinas.h
@@ -0,0 +1,83 @@
+/*
+* (C) 2024 Jack Lloyd
+*
+* Botan is released under the Simplified BSD License (see license.txt)
+*/
+
+#ifndef BOTAN_PCURVES_SOLINAS_REDC_HELPER_H_
+#define BOTAN_PCURVES_SOLINAS_REDC_HELPER_H_
+
+#include <botan/internal/mp_core.h>
+
+namespace Botan {
+
+/*
+Helpers for modular reduction of Solinas primes, such as P-256 and P-384.
+
+Instead of explicitly forming the various integers and adding/subtracting them
+row-by-row, we compute the entire sum in one pass, column by column. To prevent
+overflow/underflow the accumulator is a signed 64-bit integer, while the various
+limbs are (at least for all NIST curves aside from P-192) 32 bit integers.
+
+For more background on Solinas primes / Solinas reduction see
+
+* J. Solinas 'Generalized Mersenne Numbers'
+  <https://cacr.uwaterloo.ca/techreports/1999/corr99-39.pdf>
+* NIST SP 800-186 Appendix G.1
+  <https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-186.pdf>
+* Handbook of Elliptic And Hyperelliptic Curve Cryptography § 10.4.3
+
+*/
+
+template <WordType W>
+constexpr uint32_t get_uint32(const W xw[], size_t i) {
+   static_assert(WordInfo<W>::bits == 32 || WordInfo<W>::bits == 64);
+
+   if constexpr(WordInfo<W>::bits == 32) {
+      return xw[i];
+   } else {
+      return static_cast<uint32_t>(xw[i / 2] >> ((i % 2) * 32));
+   }
+}
+
+template <WordType W, size_t N>
+class SolinasAccum {
+   public:
+      static_assert(WordInfo<W>::bits == 32 || WordInfo<W>::bits == 64);
+
+      static constexpr size_t N32 = N * (WordInfo<W>::bits / 32);
+
+      SolinasAccum(std::array<W, N>& r) : m_r(r), m_S(0), m_idx(0) {}
+
+      void accum(int64_t v) {
+         BOTAN_DEBUG_ASSERT(m_idx < N32);
+
+         m_S += v;
+         const uint32_t r = static_cast<uint32_t>(m_S);
+         m_S >>= 32;
+
+         if constexpr(WordInfo<W>::bits == 32) {
+            m_r[m_idx] = r;
+         } else {
+            m_r[m_idx / 2] |= static_cast<uint64_t>(r) << (32 * (m_idx % 2));
+         }
+
+         m_idx += 1;
+      }
+
+      W final_carry(int64_t C) {
+         BOTAN_DEBUG_ASSERT(m_idx == N32);
+         m_S += C;
+         BOTAN_DEBUG_ASSERT(m_S >= 0);
+         return static_cast<W>(m_S);
+      }
+
+   private:
+      std::array<W, N>& m_r;
+      int64_t m_S;
+      size_t m_idx;
+};
+
+}  // namespace Botan
+
+#endif

From fa71e702bd4fe19ae99b6dbb6b8a0af84c069df8 Mon Sep 17 00:00:00 2001
From: Jack Lloyd <jack@randombit.net>
Date: Sun, 23 Jun 2024 22:26:55 -0400
Subject: [PATCH 2/2] Add specialized reduction for P-384

This is about 20-30% faster on both 32 and 64 bit systems
---
 .../pcurves_secp384r1/pcurves_secp384r1.cpp   | 113 +++++++++++++++++-
 1 file changed, 112 insertions(+), 1 deletion(-)

diff --git a/src/lib/math/pcurves/pcurves_secp384r1/pcurves_secp384r1.cpp b/src/lib/math/pcurves/pcurves_secp384r1/pcurves_secp384r1.cpp
index 6d00ee79d82..24bf5fe2556 100644
--- a/src/lib/math/pcurves/pcurves_secp384r1/pcurves_secp384r1.cpp
+++ b/src/lib/math/pcurves/pcurves_secp384r1/pcurves_secp384r1.cpp
@@ -6,12 +6,123 @@
 
 #include <botan/internal/pcurves_instance.h>
 
+#include <botan/internal/pcurves_solinas.h>
 #include <botan/internal/pcurves_wrap.h>
 
 namespace Botan::PCurve {
 
 namespace {
 
+template <typename Params>
+class Secp384r1Rep final {
+   public:
+      static constexpr auto P = Params::P;
+      static constexpr size_t N = Params::N;
+      typedef typename Params::W W;
+
+      constexpr static std::array<W, N> redc(const std::array<W, 2 * N>& z) {
+         const int64_t X00 = get_uint32(z.data(), 0);
+         const int64_t X01 = get_uint32(z.data(), 1);
+         const int64_t X02 = get_uint32(z.data(), 2);
+         const int64_t X03 = get_uint32(z.data(), 3);
+         const int64_t X04 = get_uint32(z.data(), 4);
+         const int64_t X05 = get_uint32(z.data(), 5);
+         const int64_t X06 = get_uint32(z.data(), 6);
+         const int64_t X07 = get_uint32(z.data(), 7);
+         const int64_t X08 = get_uint32(z.data(), 8);
+         const int64_t X09 = get_uint32(z.data(), 9);
+         const int64_t X10 = get_uint32(z.data(), 10);
+         const int64_t X11 = get_uint32(z.data(), 11);
+         const int64_t X12 = get_uint32(z.data(), 12);
+         const int64_t X13 = get_uint32(z.data(), 13);
+         const int64_t X14 = get_uint32(z.data(), 14);
+         const int64_t X15 = get_uint32(z.data(), 15);
+         const int64_t X16 = get_uint32(z.data(), 16);
+         const int64_t X17 = get_uint32(z.data(), 17);
+         const int64_t X18 = get_uint32(z.data(), 18);
+         const int64_t X19 = get_uint32(z.data(), 19);
+         const int64_t X20 = get_uint32(z.data(), 20);
+         const int64_t X21 = get_uint32(z.data(), 21);
+         const int64_t X22 = get_uint32(z.data(), 22);
+         const int64_t X23 = get_uint32(z.data(), 23);
+
+         // One copy of P-384 is added to prevent underflow
+         const int64_t S0 = 0xFFFFFFFF + X00 + X12 + X20 + X21 - X23;
+         const int64_t S1 = 0x00000000 + X01 + X13 + X22 + X23 - X12 - X20;
+         const int64_t S2 = 0x00000000 + X02 + X14 + X23 - X13 - X21;
+         const int64_t S3 = 0xFFFFFFFF + X03 + X12 + X15 + X20 + X21 - X14 - X22 - X23;
+         const int64_t S4 = 0xFFFFFFFE + X04 + X12 + X13 + X16 + X20 + X21 * 2 + X22 - X15 - X23 * 2;
+         const int64_t S5 = 0xFFFFFFFF + X05 + X13 + X14 + X17 + X21 + X22 * 2 + X23 - X16;
+         const int64_t S6 = 0xFFFFFFFF + X06 + X14 + X15 + X18 + X22 + X23 * 2 - X17;
+         const int64_t S7 = 0xFFFFFFFF + X07 + X15 + X16 + X19 + X23 - X18;
+         const int64_t S8 = 0xFFFFFFFF + X08 + X16 + X17 + X20 - X19;
+         const int64_t S9 = 0xFFFFFFFF + X09 + X17 + X18 + X21 - X20;
+         const int64_t SA = 0xFFFFFFFF + X10 + X18 + X19 + X22 - X21;
+         const int64_t SB = 0xFFFFFFFF + X11 + X19 + X20 + X23 - X22;
+
+         std::array<W, N> r = {};
+
+         SolinasAccum sum(r);
+
+         sum.accum(S0);
+         sum.accum(S1);
+         sum.accum(S2);
+         sum.accum(S3);
+         sum.accum(S4);
+         sum.accum(S5);
+         sum.accum(S6);
+         sum.accum(S7);
+         sum.accum(S8);
+         sum.accum(S9);
+         sum.accum(SA);
+         sum.accum(SB);
+         const auto S = sum.final_carry(0);
+
+         BOTAN_DEBUG_ASSERT(S <= 4);
+
+         const auto correction = p384_mul_mod_384(S);
+         W borrow = bigint_sub2(r.data(), N, correction.data(), N);
+
+         bigint_cnd_add(borrow, r.data(), N, P.data(), N);
+
+         return r;
+      }
+
+      constexpr static std::array<W, N> one() { return std::array<W, N>{1}; }
+
+      constexpr static std::array<W, N> to_rep(const std::array<W, N>& x) { return x; }
+
+      constexpr static std::array<W, N> wide_to_rep(const std::array<W, 2 * N>& x) { return redc(x); }
+
+      constexpr static std::array<W, N> from_rep(const std::array<W, N>& z) { return z; }
+
+   private:
+      // Return (i*P-384) % 2**384
+      //
+      // Assumes i is small
+      constexpr static std::array<W, N> p384_mul_mod_384(W i) {
+         static_assert(WordInfo<W>::bits == 32 || WordInfo<W>::bits == 64);
+
+         // For small i, multiples of P-384 have a simple structure so it's faster to
+         // compute the value directly vs a (constant time) table lookup
+
+         auto r = P;
+         if constexpr(WordInfo<W>::bits == 32) {
+            r[4] -= i;
+            r[3] -= i;
+            r[1] += i;
+            r[0] -= i;
+         } else {
+            const uint64_t i32 = static_cast<uint64_t>(i) << 32;
+            r[2] -= i;
+            r[1] -= i32;
+            r[0] += i32;
+            r[0] -= i;
+         }
+         return r;
+      }
+};
+
 // clang-format off
 namespace secp384r1 {
 
@@ -25,7 +136,7 @@ class Params final : public EllipticCurveParameters<
    -12> {
 };
 
-class Curve final : public EllipticCurve<Params> {};
+class Curve final : public EllipticCurve<Params, Secp384r1Rep> {};
 
 }