From 6380743fa05cccf5ea1f4b49a1eb23c736752e9d Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Sat, 22 Jun 2024 16:33:59 -0400 Subject: [PATCH 1/2] Add specialized reduction for P256 in pcurves For 32-bit x86, this reduction results in point arithmetic operations that are 25-35% faster than when using Montgomery. Sadly for 64-bit x86 it is at best about even with using Montgomery, and for Clang 64-bit it's even somewhat slower. --- src/lib/math/pcurves/info.txt | 1 + .../pcurves_secp256r1/pcurves_secp256r1.cpp | 114 +++++++++++++++++- src/lib/math/pcurves/pcurves_solinas.h | 83 +++++++++++++ 3 files changed, 194 insertions(+), 4 deletions(-) create mode 100644 src/lib/math/pcurves/pcurves_solinas.h diff --git a/src/lib/math/pcurves/info.txt b/src/lib/math/pcurves/info.txt index d07bb208f3f..730b60f62bb 100644 --- a/src/lib/math/pcurves/info.txt +++ b/src/lib/math/pcurves/info.txt @@ -19,6 +19,7 @@ pcurves.h pcurves_id.h pcurves_impl.h pcurves_util.h +pcurves_solinas.h pcurves_wrap.h pcurves_instance.h diff --git a/src/lib/math/pcurves/pcurves_secp256r1/pcurves_secp256r1.cpp b/src/lib/math/pcurves/pcurves_secp256r1/pcurves_secp256r1.cpp index d7aee4cb44d..80303a3204f 100644 --- a/src/lib/math/pcurves/pcurves_secp256r1/pcurves_secp256r1.cpp +++ b/src/lib/math/pcurves/pcurves_secp256r1/pcurves_secp256r1.cpp @@ -6,15 +6,115 @@ #include +#include #include namespace Botan::PCurve { namespace { -// clang-format off +template +class Secp256r1Rep final { + public: + static constexpr auto P = Params::P; + static constexpr size_t N = Params::N; + typedef typename Params::W W; + + // Adds 4 * P-256 to prevent underflow + static constexpr auto P256_4 = + hex_to_words("0x3fffffffc00000004000000000000000000000003fffffffffffffffffffffffc"); + + constexpr static std::array redc(const std::array& z) { + const int64_t X00 = get_uint32(z.data(), 0); + const int64_t X01 = get_uint32(z.data(), 1); + const int64_t X02 = get_uint32(z.data(), 2); + const int64_t X03 = get_uint32(z.data(), 3); + const int64_t X04 = get_uint32(z.data(), 4); + const int64_t X05 = get_uint32(z.data(), 5); + const int64_t X06 = get_uint32(z.data(), 6); + const int64_t X07 = get_uint32(z.data(), 7); + const int64_t X08 = get_uint32(z.data(), 8); + const int64_t X09 = get_uint32(z.data(), 9); + const int64_t X10 = get_uint32(z.data(), 10); + const int64_t X11 = get_uint32(z.data(), 11); + const int64_t X12 = get_uint32(z.data(), 12); + const int64_t X13 = get_uint32(z.data(), 13); + const int64_t X14 = get_uint32(z.data(), 14); + const int64_t X15 = get_uint32(z.data(), 15); + + // See SP 800-186 section G.1.2 + const int64_t S0 = P256_4[0] + X00 + X08 + X09 - (X11 + X12 + X13 + X14); + const int64_t S1 = P256_4[1] + X01 + X09 + X10 - (X12 + X13 + X14 + X15); + const int64_t S2 = P256_4[2] + X02 + X10 + X11 - (X13 + X14 + X15); + const int64_t S3 = P256_4[3] + X03 + 2 * (X11 + X12) + X13 - (X15 + X08 + X09); + const int64_t S4 = P256_4[4] + X04 + 2 * (X12 + X13) + X14 - (X09 + X10); + const int64_t S5 = P256_4[5] + X05 + 2 * (X13 + X14) + X15 - (X10 + X11); + const int64_t S6 = P256_4[6] + X06 + X13 + X14 * 3 + X15 * 2 - (X08 + X09); + const int64_t S7 = P256_4[7] + X07 + X15 * 3 + X08 - (X10 + X11 + X12 + X13); + const int64_t S8 = P256_4[8]; + + std::array r = {}; + + SolinasAccum sum(r); + + sum.accum(S0); + sum.accum(S1); + sum.accum(S2); + sum.accum(S3); + sum.accum(S4); + sum.accum(S5); + sum.accum(S6); + sum.accum(S7); + const auto S = sum.final_carry(S8); + + BOTAN_DEBUG_ASSERT(S <= 8); + + const auto correction = p256_mul_mod_256(S); + W borrow = bigint_sub2(r.data(), N, correction.data(), N); + + bigint_cnd_add(borrow, r.data(), N, P.data(), N); + + return r; + } + + constexpr static std::array one() { return std::array{1}; } + + constexpr static std::array to_rep(const std::array& x) { return x; } + + constexpr static std::array wide_to_rep(const std::array& x) { return redc(x); } + + constexpr static std::array from_rep(const std::array& z) { return z; } + + private: + // Return (i*P-256) % 2**256 + // + // Assumes i is small + constexpr static std::array p256_mul_mod_256(W i) { + static_assert(WordInfo::bits == 32 || WordInfo::bits == 64); + + // For small i, multiples of P-256 have a simple structure so it's faster to + // compute the value directly vs a (constant time) table lookup + + auto r = P; + if constexpr(WordInfo::bits == 32) { + r[7] -= i; + r[6] += i; + r[3] += i; + r[0] -= i; + } else { + const uint64_t i32 = static_cast(i) << 32; + r[3] -= i32; + r[3] += i; + r[1] += i32; + r[0] -= i; + } + return r; + } +}; + namespace secp256r1 { +// clang-format off class Params final : public EllipticCurveParameters< "FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFF", "FFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFC", @@ -25,11 +125,17 @@ class Params final : public EllipticCurveParameters< -10> { }; -class Curve final : public EllipticCurve {}; +// clang-format on -} +#if BOTAN_MP_WORD_BITS == 32 +// Secp256r1Rep works for 64 bit also, but is at best marginally faster at least +// on compilers/CPUs tested so far +class Curve final : public EllipticCurve {}; +#else +class Curve final : public EllipticCurve {}; +#endif -// clang-format on +} // namespace secp256r1 } // namespace diff --git a/src/lib/math/pcurves/pcurves_solinas.h b/src/lib/math/pcurves/pcurves_solinas.h new file mode 100644 index 00000000000..0f3e9af4a27 --- /dev/null +++ b/src/lib/math/pcurves/pcurves_solinas.h @@ -0,0 +1,83 @@ +/* +* (C) 2024 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#ifndef BOTAN_PCURVES_SOLINAS_REDC_HELPER_H_ +#define BOTAN_PCURVES_SOLINAS_REDC_HELPER_H_ + +#include + +namespace Botan { + +/* +Helpers for modular reduction of Solinas primes, such as P-256 and P-384. + +Instead of explicitly forming the various integers and adding/subtracting them +row-by-row, we compute the entire sum in one pass, column by column. To prevent +overflow/underflow the accumulator is a signed 64-bit integer, while the various +limbs are (at least for all NIST curves aside from P-192) 32 bit integers. + +For more background on Solinas primes / Solinas reduction see + +* J. Solinas 'Generalized Mersenne Numbers' + +* NIST SP 800-186 Appendix G.1 + +* Handbook of Elliptic And Hyperelliptic Curve Cryptography ยง 10.4.3 + +*/ + +template +constexpr uint32_t get_uint32(const W xw[], size_t i) { + static_assert(WordInfo::bits == 32 || WordInfo::bits == 64); + + if constexpr(WordInfo::bits == 32) { + return xw[i]; + } else { + return static_cast(xw[i / 2] >> ((i % 2) * 32)); + } +} + +template +class SolinasAccum { + public: + static_assert(WordInfo::bits == 32 || WordInfo::bits == 64); + + static constexpr size_t N32 = N * (WordInfo::bits / 32); + + SolinasAccum(std::array& r) : m_r(r), m_S(0), m_idx(0) {} + + void accum(int64_t v) { + BOTAN_DEBUG_ASSERT(m_idx < N32); + + m_S += v; + const uint32_t r = static_cast(m_S); + m_S >>= 32; + + if constexpr(WordInfo::bits == 32) { + m_r[m_idx] = r; + } else { + m_r[m_idx / 2] |= static_cast(r) << (32 * (m_idx % 2)); + } + + m_idx += 1; + } + + W final_carry(int64_t C) { + BOTAN_DEBUG_ASSERT(m_idx == N32); + m_S += C; + BOTAN_DEBUG_ASSERT(m_S >= 0); + return static_cast(m_S); + } + + private: + std::array& m_r; + int64_t m_S; + size_t m_idx; +}; + +} // namespace Botan + +#endif From fa71e702bd4fe19ae99b6dbb6b8a0af84c069df8 Mon Sep 17 00:00:00 2001 From: Jack Lloyd Date: Sun, 23 Jun 2024 22:26:55 -0400 Subject: [PATCH 2/2] Add specialized reduction for P-384 This is about 20-30% faster on both 32 and 64 bit systems --- .../pcurves_secp384r1/pcurves_secp384r1.cpp | 113 +++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/src/lib/math/pcurves/pcurves_secp384r1/pcurves_secp384r1.cpp b/src/lib/math/pcurves/pcurves_secp384r1/pcurves_secp384r1.cpp index 6d00ee79d82..24bf5fe2556 100644 --- a/src/lib/math/pcurves/pcurves_secp384r1/pcurves_secp384r1.cpp +++ b/src/lib/math/pcurves/pcurves_secp384r1/pcurves_secp384r1.cpp @@ -6,12 +6,123 @@ #include +#include #include namespace Botan::PCurve { namespace { +template +class Secp384r1Rep final { + public: + static constexpr auto P = Params::P; + static constexpr size_t N = Params::N; + typedef typename Params::W W; + + constexpr static std::array redc(const std::array& z) { + const int64_t X00 = get_uint32(z.data(), 0); + const int64_t X01 = get_uint32(z.data(), 1); + const int64_t X02 = get_uint32(z.data(), 2); + const int64_t X03 = get_uint32(z.data(), 3); + const int64_t X04 = get_uint32(z.data(), 4); + const int64_t X05 = get_uint32(z.data(), 5); + const int64_t X06 = get_uint32(z.data(), 6); + const int64_t X07 = get_uint32(z.data(), 7); + const int64_t X08 = get_uint32(z.data(), 8); + const int64_t X09 = get_uint32(z.data(), 9); + const int64_t X10 = get_uint32(z.data(), 10); + const int64_t X11 = get_uint32(z.data(), 11); + const int64_t X12 = get_uint32(z.data(), 12); + const int64_t X13 = get_uint32(z.data(), 13); + const int64_t X14 = get_uint32(z.data(), 14); + const int64_t X15 = get_uint32(z.data(), 15); + const int64_t X16 = get_uint32(z.data(), 16); + const int64_t X17 = get_uint32(z.data(), 17); + const int64_t X18 = get_uint32(z.data(), 18); + const int64_t X19 = get_uint32(z.data(), 19); + const int64_t X20 = get_uint32(z.data(), 20); + const int64_t X21 = get_uint32(z.data(), 21); + const int64_t X22 = get_uint32(z.data(), 22); + const int64_t X23 = get_uint32(z.data(), 23); + + // One copy of P-384 is added to prevent underflow + const int64_t S0 = 0xFFFFFFFF + X00 + X12 + X20 + X21 - X23; + const int64_t S1 = 0x00000000 + X01 + X13 + X22 + X23 - X12 - X20; + const int64_t S2 = 0x00000000 + X02 + X14 + X23 - X13 - X21; + const int64_t S3 = 0xFFFFFFFF + X03 + X12 + X15 + X20 + X21 - X14 - X22 - X23; + const int64_t S4 = 0xFFFFFFFE + X04 + X12 + X13 + X16 + X20 + X21 * 2 + X22 - X15 - X23 * 2; + const int64_t S5 = 0xFFFFFFFF + X05 + X13 + X14 + X17 + X21 + X22 * 2 + X23 - X16; + const int64_t S6 = 0xFFFFFFFF + X06 + X14 + X15 + X18 + X22 + X23 * 2 - X17; + const int64_t S7 = 0xFFFFFFFF + X07 + X15 + X16 + X19 + X23 - X18; + const int64_t S8 = 0xFFFFFFFF + X08 + X16 + X17 + X20 - X19; + const int64_t S9 = 0xFFFFFFFF + X09 + X17 + X18 + X21 - X20; + const int64_t SA = 0xFFFFFFFF + X10 + X18 + X19 + X22 - X21; + const int64_t SB = 0xFFFFFFFF + X11 + X19 + X20 + X23 - X22; + + std::array r = {}; + + SolinasAccum sum(r); + + sum.accum(S0); + sum.accum(S1); + sum.accum(S2); + sum.accum(S3); + sum.accum(S4); + sum.accum(S5); + sum.accum(S6); + sum.accum(S7); + sum.accum(S8); + sum.accum(S9); + sum.accum(SA); + sum.accum(SB); + const auto S = sum.final_carry(0); + + BOTAN_DEBUG_ASSERT(S <= 4); + + const auto correction = p384_mul_mod_384(S); + W borrow = bigint_sub2(r.data(), N, correction.data(), N); + + bigint_cnd_add(borrow, r.data(), N, P.data(), N); + + return r; + } + + constexpr static std::array one() { return std::array{1}; } + + constexpr static std::array to_rep(const std::array& x) { return x; } + + constexpr static std::array wide_to_rep(const std::array& x) { return redc(x); } + + constexpr static std::array from_rep(const std::array& z) { return z; } + + private: + // Return (i*P-384) % 2**384 + // + // Assumes i is small + constexpr static std::array p384_mul_mod_384(W i) { + static_assert(WordInfo::bits == 32 || WordInfo::bits == 64); + + // For small i, multiples of P-384 have a simple structure so it's faster to + // compute the value directly vs a (constant time) table lookup + + auto r = P; + if constexpr(WordInfo::bits == 32) { + r[4] -= i; + r[3] -= i; + r[1] += i; + r[0] -= i; + } else { + const uint64_t i32 = static_cast(i) << 32; + r[2] -= i; + r[1] -= i32; + r[0] += i32; + r[0] -= i; + } + return r; + } +}; + // clang-format off namespace secp384r1 { @@ -25,7 +136,7 @@ class Params final : public EllipticCurveParameters< -12> { }; -class Curve final : public EllipticCurve {}; +class Curve final : public EllipticCurve {}; }