gt: stash progress, Fp12 over Fp6 fails ref or opt multiexp while Fp1…

…2 over Fp4 doesn't (without Torus)
mratsim · Sep 25, 2024 · 79c3e92 · 79c3e92
1 parent 1916910
commit 79c3e92
Show file tree

Hide file tree

Showing 6 changed files with 181 additions and 36 deletions.
diff --git a/constantine/math/pairings/gt_exponentiations_vartime.nim b/constantine/math/pairings/gt_exponentiations_vartime.nim
@@ -263,8 +263,8 @@ func gtExpEndo_wNAF_vartime*[Gt: ExtensionField, scalBits: static int](
   static: doAssert window <= 4, "Window of size " & $window & " is too large and precomputation would use " & $(precompSize * sizeof(Gt)) & " stack space."
 
   # 1. Compute endomorphisms
-  const M = when Gt is Fp6:  2
-            elif Gt is Fp12: 4
+  const M = when Gt.Name.getEmbeddingDegree() == 6:  2
+            elif Gt.Name.getEmbeddingDegree() == 12: 4
             else: {.error: "Unconfigured".}
 
   var endos {.noInit.}: array[M-1, Gt]
@@ -351,9 +351,9 @@ func gtExp_vartime*[Gt: ExtensionField, scalBits: static int](
   when Gt.Name.hasEndomorphismAcceleration():
     when scalBits >= EndomorphismThreshold: # Skip static: doAssert when multiplying by intentionally small scalars.
       if usedBits >= EndomorphismThreshold:
-        when Gt is Fp6:
+        when Gt.Name.getEmbeddingDegree() == 6:
           r.gtExpEndo_wNAF_vartime(a, scalar, window = 4)
-        elif Gt is Fp12:
+        elif Gt.Name.getEmbeddingDegree() == 12:
           r.gtExpEndo_wNAF_vartime(a, scalar, window = 3)
         else: # Curves defined on Fp^m with m > 2
           {.error: "Unconfigured".}

diff --git a/constantine/math/pairings/gt_multiexp.nim b/constantine/math/pairings/gt_multiexp.nim
@@ -7,12 +7,13 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
 import constantine/named/algebras,
+       constantine/math/arithmetic,
        constantine/math/endomorphisms/split_scalars,
        constantine/math/extension_fields,
        constantine/math/arithmetic/bigints,
        constantine/named/zoo_endomorphisms,
        constantine/platforms/abstractions,
-       ./cyclotomic_subgroups
+       ./cyclotomic_subgroups, ./gt_prj
 
 # No exceptions allowed in core cryptographic operations
 {.push raises: [].}
@@ -96,12 +97,27 @@ func `~/=`[Gt: ExtensionField](a: var Gt, b: Gt) {.inline.} =
 func setNeutral[Gt: ExtensionField](a: var Gt) {.inline.} =
   a.setOne()
 
+func `~*=`(a: var T2Prj, b: T2Aff) {.inline.} =
+  a.mixedProd_vartime(a, b)
+
+func `~*=`(a: var T2Prj, b: T2Prj) {.inline.} =
+  a.prod(a, b)
+
+func `~/=`(a: var T2Prj, b: T2Aff) {.inline.} =
+  ## Cyclotomic division
+  var t {.noInit.}: T2Aff
+  t.cyclotomic_inv(b)
+  a ~*= t
+
 # Reference multi-exponentiation
 # -------------------------------------------------------------
+# We distinguish GtAcc (GT Accumulators) from GtElt (Gt Element)
+# They can map to the same type if using extension fields
+# or to 2 different types if using tori (affine and projective torus coordinates)
 
-func multiExpImpl_reference_vartime[bits: static int, Gt](
-       r: var Gt,
-       elems: ptr UncheckedArray[Gt],
+func multiExpImpl_reference_vartime[bits: static int, GtAcc, GtElt](
+       r: var GtAcc,
+       elems: ptr UncheckedArray[GtElt],
        expos: ptr UncheckedArray[BigInt[bits]],
        N: int, c: static int) {.tags:[VarTime, HeapAlloc].} =
   ## Inner implementation of MEXP, for static dispatch over c, the bucket bit length
@@ -112,8 +128,8 @@ func multiExpImpl_reference_vartime[bits: static int, Gt](
   const numBuckets = 1 shl c - 1 # bucket 0 is unused
   const numWindows = bits.ceilDiv_vartime(c)
 
-  let miniEXPs = allocHeapArray(Gt, numWindows)
-  let buckets = allocHeapArray(Gt, numBuckets)
+  let miniEXPs = allocHeapArray(GtAcc, numWindows)
+  let buckets = allocHeapArray(GtAcc, numBuckets)
 
   # Algorithm
   # ---------
@@ -134,7 +150,7 @@ func multiExpImpl_reference_vartime[bits: static int, Gt](
     # 2. Bucket reduction.                               Cost: 2x(2ᶜ-2) => 2 additions per 2ᶜ-1 bucket, last bucket is just copied
     # We have ordered subset sums in each bucket, we now need to compute the mini-exponentiation
     #   S₁¹ + S₂² + S₃³ + ... + (S₂c₋₁)^(2ᶜ-1)
-    var accumBuckets{.noInit.}, miniEXP{.noInit.}: Gt
+    var accumBuckets{.noInit.}, miniEXP{.noInit.}: GtAcc
     accumBuckets = buckets[numBuckets-1]
     miniEXP = buckets[numBuckets-1]
 
@@ -157,9 +173,9 @@ func multiExpImpl_reference_vartime[bits: static int, Gt](
   buckets.freeHeap()
   miniEXPs.freeHeap()
 
-func multiExp_reference_dispatch_vartime[bits: static int, Gt](
-       r: var Gt,
-       elems: ptr UncheckedArray[Gt],
+func multiExp_reference_dispatch_vartime[bits: static int, GtAcc, GtElt](
+       r: var GtAcc,
+       elems: ptr UncheckedArray[GtElt],
        expos: ptr UncheckedArray[BigInt[bits]],
        N: int) {.tags:[VarTime, HeapAlloc].} =
   ## Multiexponentiation:
@@ -190,41 +206,58 @@ func multiExp_reference_vartime*[bits: static int, Gt](
        r: var Gt,
        elems: ptr UncheckedArray[Gt],
        expos: ptr UncheckedArray[BigInt[bits]],
-       N: int) {.tags:[VarTime, HeapAlloc].} =
+       N: int, useTorus: static bool = true) {.tags:[VarTime, HeapAlloc].} =
   ## Multiexponentiation:
   ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
-  multiExp_reference_dispatch_vartime(r, elems, expos, N)
+  when useTorus:
+    static: doAssert Gt is QuadraticExt, "GT was: " & $Gt
+    type F = typeof(elems[0].c0)
+    let elemsTorus = allocHeapArrayAligned(T2Aff[F], N, alignment = 64)
+    elemsTorus.toOpenArray(0, N-1).batchFromGT_vartime(
+      elems.toOpenArray(0, N-1)
+    )
+    var r_torus {.noInit.}: T2Prj[F]
+    multiExp_reference_dispatch_vartime(r_torus, elemsTorus, expos, N)
+    r.fromTorus2_vartime(r_torus)
+  else:
+    multiExp_reference_dispatch_vartime(r, elems, expos, N)
 
-func multiExp_reference_vartime*[Gt](r: var Gt, elems: openArray[Gt], expos: openArray[BigInt]) {.tags:[VarTime, HeapAlloc].} =
+func multiExp_reference_vartime*[Gt](
+      r: var Gt,
+      elems: openArray[Gt],
+      expos: openArray[BigInt],
+      useTorus: static bool = true) {.tags:[VarTime, HeapAlloc].} =
   ## Multiexponentiation:
   ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
   debug: doAssert expos.len == elems.len
   let N = elems.len
-  multiExp_reference_dispatch_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N)
+  multiExp_reference_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N, useTorus)
 
 func multiExp_reference_vartime*[F, Gt](
        r: var Gt,
        elems: ptr UncheckedArray[Gt],
        expos: ptr UncheckedArray[F],
-       len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
+       len: int,
+       useTorus: static bool = true) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
   ## Multiexponentiation:
   ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
   let n = cast[int](len)
   let expos_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64)
   expos_big.batchFromField(expos, n)
-  r.multiExp_reference_vartime(elems, expos_big, n)
+  r.multiExp_reference_vartime(elems, expos_big, n, useTorus)
 
   freeHeapAligned(expos_big)
 
 func multiExp_reference_vartime*[Gt](
        r: var Gt,
        elems: openArray[Gt],
-       expos: openArray[Fr]) {.tags:[VarTime, Alloca, HeapAlloc], inline.} =
+       expos: openArray[Fr],
+       useTorus: static bool = true) {.tags:[VarTime, Alloca, HeapAlloc], inline.} =
   ## Multiexponentiation:
   ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
   debug: doAssert expos.len == elems.len
   let N = elems.len
-  multiExp_reference_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N)
+  multiExp_reference_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N, useTorus)
 
 # ########################################################### #
 #                                                             #
@@ -233,7 +266,6 @@ func multiExp_reference_vartime*[Gt](
 #                                                             #
 # ########################################################### #
 
-
 func accumulate[GT](buckets: ptr UncheckedArray[GT], val: SecretWord, negate: SecretBool, elem: GT) {.inline, meter.} =
   let val = BaseType(val)
   if val == 0: # Skip g⁰
@@ -366,8 +398,8 @@ proc applyEndomorphism[bits: static int, GT](
   ## Returns a new triplet (endoElems, endoExpos, N)
   ## endoElems and endoExpos MUST be freed afterwards
 
-  const M = when Gt is Fp6:  2
-            elif Gt is Fp12: 4
+  const M = when Gt.Name.getEmbeddingDegree() == 6:  2
+            elif Gt.Name.getEmbeddingDegree() == 12: 4
             else: {.error: "Unconfigured".}
 
   const L = Fr[Gt.Name].bits().computeEndoRecodedLength(M)
@@ -410,8 +442,12 @@ template withEndo[exponentsBits: static int, GT](
   else:
     multiExpProc(r, elems, expos, N, c)
 
-# Algorithm selection
-# -----------------------------------------------------------------------------------------------------------------------
+# ########################################################### #
+#                                                             #
+#                 Multi-exponentiations in 𝔾ₜ                 #
+#                   Algorithm selection                       #
+#                                                             #
+# ########################################################### #
 
 func multiexp_dispatch_vartime[bits: static int, GT](
        r: var GT,

diff --git a/constantine/math/pairings/gt_prj.nim b/constantine/math/pairings/gt_prj.nim
@@ -283,8 +283,32 @@ type
 
 template x[F](a: T2Prj[F]): F = a.coords[0]
 template z[F](a: T2Prj[F]): F = a.coords[1]
+template `x=`[F](a: var T2Prj[F], v: F) = a.coords[0] = v
+template `z=`[F](a: var T2Prj[F], v: F) = a.coords[1] = v
+
+proc setNeutral*[F](a: var T2Aff[F]) =
+  # We special-case the neutral element to 0
+  # TODO: this is in case an element of the Torus might be compressed
+  # to coordinate 1
+  # TODO: Can a GT element of the form a + bv have a == 0?
+  F(a).setZero()
+
+proc isNeutral*[F](a: T2Aff[F]): SecretBool =
+  F(a).isZero()
+
+proc setNeutral*(a: var T2Prj) =
+  a.x.setOne()
+  a.z.setZero()
+
+proc isNeutral*(a: T2Prj): SecretBool =
+  a.z.isZero()
 
 proc fromGT_vartime*[F](r: var T2Aff[F], a: QuadraticExt[F]) =
+  # Special case identity element
+  if bool a.isNeutral():
+    r.setNeutral()
+    return
+
   var t {.noInit.}, one {.noInit.}: F
   t.inv_vartime(a.c1)
   one.setOne()
@@ -293,6 +317,11 @@ proc fromGT_vartime*[F](r: var T2Aff[F], a: QuadraticExt[F]) =
   F(r) *= t
 
 proc fromGT_vartime*[F](r: var T2Prj[F], a: QuadraticExt[F]) =
+  # Special case identity element
+  if bool a.isOne():
+    r.setNeutral()
+    return
+
   var t {.noInit.}: F
   t.inv_vartime(a.c1)
   r.z.setOne()
@@ -301,8 +330,13 @@ proc fromGT_vartime*[F](r: var T2Prj[F], a: QuadraticExt[F]) =
   r.x *= t
 
 proc fromTorus2_vartime*[F](r: var QuadraticExt[F], a: T2Aff[F]) =
-  var num {.noInit.}, den {.noInit.}: typeof(r)
 
+  # Special case identity element
+  if bool a.isNeutral():
+    r.setNeutral()
+    return
+
+  var num {.noInit.}, den {.noInit.}: typeof(r)
   num.c0 = F a
   num.c1.setMinusOne()
   den.c0 = F a
@@ -311,6 +345,12 @@ proc fromTorus2_vartime*[F](r: var QuadraticExt[F], a: T2Aff[F]) =
   r.prod(num, den)
 
 proc fromTorus2_vartime*[F](r: var QuadraticExt[F], a: T2Prj[F]) =
+
+  # Special case identity element
+  if bool a.isNeutral():
+    r.setOne()
+    return
+
   type QF = QuadraticExt[F]
 
   var t0 {.noInit.}, t1 {.noInit.}: QF
@@ -320,10 +360,26 @@ proc fromTorus2_vartime*[F](r: var QuadraticExt[F], a: T2Prj[F]) =
 
   r.prod(t0, t1)
 
-proc mixedProd*[F](r: var T2Prj[F], a: T2Prj[F], b: T2Aff[F]) =
+proc fromAffine_vartime*[F](r: var T2Prj[F], a: T2Aff[F]) =
+  mixin `x=`
+  if bool a.isNeutral():
+    r.setNeutral()
+  else:
+    r.coords[0] = F(a) # r.x doesn't work despite bind, mixin, exports and usual generic sandwich workarounds
+    r.z.setOne()
+
+proc mixedProd_vartime*[F](r: var T2Prj[F], a: T2Prj[F], b: T2Aff[F]) =
   ## Multiplication on a torus.
   ## b MUST be in the cyclotomic subgroup
 
+  # Special case identity element
+  if bool a.isNeutral():
+    r.fromAffine_vartime(b) # handles b == 1 as well
+    return
+  if bool b.isNeutral():
+    r = a
+    return
+
   var u0 {.noInit.}, u1 {.noInit.}: F
   u0.prod(a.x, F b)
   u1.prod(a.z, F b)
@@ -332,15 +388,29 @@ proc mixedProd*[F](r: var T2Prj[F], a: T2Prj[F], b: T2Aff[F]) =
   r.x += u0
   r.z.sum(u1, a.x)
 
-proc affineProd*[F](r: var T2Prj[F], a, b: T2Aff[F]) =
+proc affineProd_vartime*[F](r: var T2Prj[F], a, b: T2Aff[F]) =
+
+  # Special case identity element
+  if bool a.isNeutral():
+    r.fromAffine_vartime(b) # handles b == 1 as well
+    return
+  if bool b.isNeutral():
+    r.fromAffine_vartime(a)
+    return
+
   r.z.sum(F a, F b)
   r.x.prod(F a, F b)
 
   var snr {.noInit.}: typeof(r.x.c1)
   snr.setOne()
   r.x.c1 += snr
 
-proc affineSquare*[F](r: var T2Prj[F], a: T2Aff[F]) =
+proc affineSquare_vartime*[F](r: var T2Prj[F], a: T2Aff[F]) =
+
+  # Special case identity element
+  if bool a.isNeutral():
+    r.setNeutral()
+    return
 
   r.z.double(F a)
   r.x.square(F a)
@@ -357,8 +427,22 @@ proc square*[F](r: var T2Prj[F], a: T2Prj[F]) {.inline.} =
   type QF = QuadraticExt[F]
   QF(r).square(QF a)
 
+template cyclotomic_square*[F](r: var T2Prj[F], a: T2Prj[F]) =
+  # Alias
+  r.square(a)
+
+template cyclotomic_square*[F](a: var T2Prj[F]) =
+  # Alias
+  a.square(a)
+
+proc inv*[F](r: var T2Aff[F], a: T2Aff[F]) {.inline.} =
+  ## Cyclotomic inversion on a Torus
+  # Note: for neutral element this is valid only
+  # if the implementation uses 0 as special-value
+  F(r).neg(F(a))
+
 proc inv*[F](r: var T2Prj[F], a: T2Prj[F]) {.inline.} =
-  # Cyclotomic inversion on a Torus
+  ## Cyclotomic inversion on a Torus
   r.x.neg(a.x)
   r.z = a.z
 
@@ -378,6 +462,8 @@ proc batchFromGT_vartime*[F](dst: var openArray[T2Aff[F]],
   ## Note: on 𝔽p6, the ratio of inversion I/M is about 3.8
   ## so this is about a ~25% speedup
 
+  # TODO: handle neutral element
+
   debug: doAssert dst.len == src.len
 
   F(dst[0]) = src[0].c1
@@ -415,6 +501,8 @@ proc batchFromTorus2_vartime*[F](dst: var openArray[QuadraticExt[F]],
   ##
   ## Note: on 𝔽p12, the ratio of inversion I/M is about 3
   ## so this has likely no speedup, and is not trivial to parallelize
+
+  # TODO: handle neutral element
   debug: doAssert dst.len == src.len
 
   # We consciously choose to recompute conj(src[i]) to avoid an allocation
@@ -446,3 +534,16 @@ proc batchFromTorus2_vartime*[F](dst: var openArray[QuadraticExt[F]],
   var t {.noInit.}: QF
   t.conj(QF src[0])
   dst[0] *= t
+
+
+when isMainModule:
+  var a, c: QuadraticExt[Fp6[BLS12_381]]
+  var b: T2Prj[Fp6[BLS12_381]]
+
+  a.setOne()
+  b.fromGT_vartime(a)
+  c.fromTorus2_vartime(b)
+
+  echo "a: ", a.toHex(indent = 4)
+  echo "b: ", QuadraticExt[Fp6[BLS12_381]](b).toHex(indent = 4)
+  echo "c: ", c.toHex(indent = 4)