gt: batch conversion

mratsim · Sep 2, 2024 · 9068cbc · 9068cbc
1 parent 3b5f316
commit 9068cbc
Show file tree

Hide file tree

Showing 3 changed files with 191 additions and 0 deletions.
diff --git a/benchmarks/bench_gt_prj.nim b/benchmarks/bench_gt_prj.nim
@@ -17,6 +17,7 @@ import
   ./bench_blueprint
 
 const Iters = 100_000 # Iteration count passed to the single-operation benchmarks in main()
+const BatchIters = 1_000 # Iteration count passed to the multi-conversion benchmarks in main()
 const AvailableCurves = [
   BLS12_381,
   # BN254_Snarks
@@ -125,6 +126,43 @@ proc gtFromTorus(C: static Algebra, iters: int) =
   bench("𝔾ₜ <- T₂(𝔽p6) conversion", Quad[Fp6[C]], iters):
     r.fromTorus2_vartime(t)
 
+proc torusFromGtMultiNaive(C: static Algebra, batchSize, iters: int) =
+  ## Benchmark: convert `batchSize` random 𝔾ₜ elements to T2Aff
+  ## one element at a time with `fromGT_vartime`.
+  ##
+  ## `iters` is forwarded to `bench`; each run of the benchmarked body
+  ## converts the whole batch.
+
+  # Inputs are generated up front, outside the `bench` body.
+  var gts = newSeq[Quad[Fp6[C]]](batchSize)
+  for k in 0 ..< gts.len:
+    gts[k] = rng.random_gt(Quad[Fp6[C]])
+
+  var tori = newSeq[T2Aff[Fp6[C]]](batchSize)
+
+  bench("T₂(𝔽p6) <- 𝔾ₜ multi-conversion naive - " & $batchSize, Quad[Fp6[C]], iters):
+    for i in 0 ..< batchSize:
+      tori[i].fromGT_vartime(gts[i])
+
+proc torusFromGtMultiBatch(C: static Algebra, batchSize, iters: int) =
+  ## Benchmark: convert `batchSize` random 𝔾ₜ elements to T2Aff
+  ## with a single call to `batchFromGT_vartime`.
+  ##
+  ## `iters` is forwarded to `bench`; each run of the benchmarked body
+  ## converts the whole batch.
+
+  # Inputs are generated up front, outside the `bench` body.
+  var gts = newSeq[Quad[Fp6[C]]](batchSize)
+  for k in 0 ..< gts.len:
+    gts[k] = rng.random_gt(Quad[Fp6[C]])
+
+  var tori = newSeq[T2Aff[Fp6[C]]](batchSize)
+
+  bench("T₂(𝔽p6) <- 𝔾ₜ multi-conversion batched - " & $batchSize, Quad[Fp6[C]], iters):
+    tori.batchFromGT_vartime(gts)
+
+proc gtFromTorus2MultiNaive(C: static Algebra, batchSize, iters: int) =
+  ## Benchmark baseline: convert `batchSize` torus elements back to 𝔾ₜ
+  ## one element at a time with `fromTorus2_vartime`.
+  ##
+  ## The torus inputs (T2Aff) are prepared before the `bench` body
+  ## from random 𝔾ₜ elements via `batchFromGT_vartime`.
+  var tt = newSeq[T2Aff[Fp6[C]]](batchSize)
+  var aa = newSeq[Quad[Fp6[C]]](batchSize)
+  for a in aa.mitems():
+    a = rng.random_gt(Quad[Fp6[C]])
+  tt.batchFromGT_vartime(aa)
+  bench("𝔾ₜ <- T₂(𝔽p6) multi-conversion naive - " & $batchSize, Quad[Fp6[C]], iters):
+    for i in 0 ..< batchSize:
+      aa[i].fromTorus2_vartime(tt[i])
+
+proc gtFromTorus2MultiBatch(C: static Algebra, batchSize, iters: int) =
+  ## Benchmark: convert `batchSize` torus elements back to 𝔾ₜ
+  ## with a single call to `batchFromTorus2_vartime`.
+  ##
+  ## The torus inputs (T2Prj) are prepared before the `bench` body
+  ## from random 𝔾ₜ elements via `fromGT_vartime`.
+  ##
+  ## NOTE(review): the naive baseline above feeds T2Aff inputs while this
+  ## batched path takes T2Prj inputs, so the two timings are not over
+  ## the same torus representation.
+  var tt = newSeq[T2Prj[Fp6[C]]](batchSize)
+  var aa = newSeq[Quad[Fp6[C]]](batchSize)
+  for a in aa.mitems():
+    a = rng.random_gt(Quad[Fp6[C]])
+  for i in 0 ..< batchSize:
+    tt[i].fromGT_vartime(aa[i])
+  bench("𝔾ₜ <- T₂(𝔽p6) multi-conversion batched - " & $batchSize, Quad[Fp6[C]], iters):
+    aa.batchfromTorus2_vartime(tt)
+
 proc mulT2_aff(C: static Algebra, iters: int) =
   let a = rng.random_gt(Quad[Fp6[C]])
   let b = rng.random_gt(Quad[Fp6[C]])
@@ -204,6 +242,12 @@ proc main() =
     separator()
     torusFromGt(curve, Iters)
     gtFromTorus(curve, Iters)
+    separator()
+    torusFromGtMultiNaive(curve, batchSize = 256, BatchIters)
+    torusFromGtMultiBatch(curve, batchSize = 256, BatchIters)
+    gtFromTorus2MultiNaive(curve, batchSize = 256, BatchIters)
+    gtFromTorus2MultiBatch(curve, batchSize = 256, BatchIters)
+    separator()
     mulT2_aff(curve, Iters)
     mulT2_mix(curve, Iters)
     mulT2_prj(curve, Iters)

diff --git a/constantine/math/pairings/gt_prj.nim b/constantine/math/pairings/gt_prj.nim
@@ -361,3 +361,88 @@ proc inv*[F](r: var T2Prj[F], a: T2Prj[F]) {.inline.} =
   # Cyclotomic inversion on a Torus
   r.x.neg(a.x)
   r.z = a.z
+
+# Batched conversions
+# -------------------
+
+proc batchFromGT_vartime*[F](dst: var openArray[T2Aff[F]],
+                             src: openArray[QuadraticExt[F]]) =
+  ## Batch conversion to Torus
+  ##
+  ## `dst` and `src` must have the same length
+  ## (only asserted under `debug`). Empty input is a no-op.
+  ##
+  ## This requires all `src` to be different from 0.
+  ## This is always true for elements in 𝔾ₜ.
+  ##
+  ## This replaces all inversions but one (on 𝔽p6 for 𝔾ₜ in 𝔽p12)
+  ## by 3 multiplications each.
+  ##
+  ## Note: on 𝔽p6, the ratio of inversion I/M is about 3.8
+  ## so this is about a ~25% speedup
+
+  debug: doAssert dst.len == src.len
+
+  if dst.len == 0:
+    # Nothing to convert. The code below indexes dst[0] and dst[dst.len-1],
+    # which would be out of bounds for an empty batch.
+    return
+
+  # Forward pass: dst[i] <- src[0].c1 * src[1].c1 * ... * src[i].c1
+  # `dst` doubles as scratch space for the running products.
+  F(dst[0]) = src[0].c1
+  for i in 1 ..< dst.len:
+    F(dst[i]).prod(F dst[i-1], src[i].c1)
+
+  # Single inversion, of the product of all the c1 coordinates
+  var accInv {.noInit.}: F
+  accInv.inv_vartime(F dst[dst.len-1])
+
+  # Backward pass: peel one factor off per step.
+  # Invariant at the top of iteration i: accInv = 1 / (src[0].c1 * ... * src[i].c1)
+  for i in countdown(dst.len-1, 1):
+    # Compute inverse: accInv * (src[0].c1 * ... * src[i-1].c1) = 1 / src[i].c1
+    F(dst[i]).prod(accInv, F dst[i-1])
+    # Next iteration: drop src[i].c1 from the accumulated inverse
+    accInv *= src[i].c1
+
+  # Only 1 / src[0].c1 is left in the accumulator
+  F(dst[0]) = accInv
+
+  var minusOne {.noInit.}: F
+  minusOne.setMinusOne()
+
+  # Finalize: scale each inverse by diff(minusOne, src[i].c0)
+  for i in 0 ..< dst.len:
+    var t {.noInit.}: F
+    t.diff(minusOne, src[i].c0)
+    F(dst[i]) *= t
+
+proc batchFromTorus2_vartime*[F](dst: var openArray[QuadraticExt[F]],
+                                 src: openArray[T2Prj[F]]) =
+  ## Batch conversion to 𝔾ₜ
+  ##
+  ## `dst` and `src` must have the same length
+  ## (only asserted under `debug`). Empty input is a no-op.
+  ##
+  ## This requires all `src` to be different from 0.
+  ## This is always true for elements in 𝔾ₜ.
+  ##
+  ## This replaces all inversions but one (on 𝔽p12 for 𝔾ₜ in 𝔽p12)
+  ## by 3 multiplications each.
+  ##
+  ## Note: on 𝔽p12, the ratio of inversion I/M is about 3
+  ## so this has likely no speedup, and is not trivial to parallelize
+  debug: doAssert dst.len == src.len
+
+  if dst.len == 0:
+    # Nothing to convert. The code below indexes dst[0] and dst[dst.len-1],
+    # which would be out of bounds for an empty batch.
+    return
+
+  # We consciously choose to recompute conj(src[i]) to avoid an allocation
+  # On BLS12-381, src[i] elements are 12*48 bytes = 576 bytes
+  type QF = QuadraticExt[F]
+
+  # Forward pass: dst[i] <- conj(src[0]) * conj(src[1]) * ... * conj(src[i])
+  # `dst` doubles as scratch space for the running products.
+  dst[0].conj(QF src[0])
+  for i in 1 ..< dst.len:
+    var ti {.noInit.}: QF
+    ti.conj(QF src[i])
+    dst[i].prod(dst[i-1], ti)
+
+  # Single inversion, of the product of all the conjugates
+  var accInv{.noInit.}: QF
+  accInv.inv(dst[dst.len-1])
+
+  # Backward pass.
+  # Invariant at the top of iteration i:
+  #   accInv = 1 / (conj(src[0]) * ... * conj(src[i]))
+  for i in countdown(dst.len-1, 1):
+    # Compute inverse: accInv * (conj(src[0]) * ... * conj(src[i-1])) = 1 / conj(src[i])
+    dst[i].prod(accInv, dst[i-1])
+    # Conjugate it
+    dst[i].conj()
+    # Next iteration: drop conj(src[i]) from the accumulated inverse
+    var ti {.noInit.}: QF
+    ti.conj(QF src[i])
+    accInv *= ti
+    # Finalize conversion
+    dst[i] *= ti
+
+  # Only 1 / conj(src[0]) is left in the accumulator:
+  # conjugate it and finalize exactly like the other elements.
+  dst[0].conj(accInv)
+  var t {.noInit.}: QF
+  t.conj(QF src[0])
+  dst[0] *= t
diff --git a/tests/math_pairings/t_gt_prj.nim b/tests/math_pairings/t_gt_prj.nim
@@ -32,6 +32,8 @@ echo "𝔾ₜ projective", " xoshiro512** seed: ", seed
 
 
 const Fp6iters = 10
+const BatchIters = 1 # Number of repetitions of each batch conversion test
+const BatchSize = 256 # Number of elements converted per batch in the batch conversion tests
 
 suite "𝔽p6 projective over 𝔽p2":
   test "Select check from Magma":
@@ -292,3 +294,63 @@ suite "Torus-based Cryptography for 𝔾ₜ, T₂(𝔽p6) compression":
     test(BN254_Nogami)
     # test(BN254_Snarks)
     test(BLS12_381)
+
+  # ====================================================================================
+
+  test "Batch conversion: T₂(𝔽p6) <- 𝔾ₜ":
+    # Checks that `batchFromGT_vartime` produces, element by element,
+    # the same T2Aff values as `fromGT_vartime` applied one at a time.
+    proc test(Name: static Algebra) =
+      type F6 = Fp6[Name]
+      # This test must pass even if the tower is built as Fp2 -> Fp4 -> Fp12
+      type MyFp12 = QuadraticExt[F6]
+
+      for rep in 0 ..< BatchIters:
+        # Random 𝔾ₜ inputs
+        var gts = newSeq[MyFp12](BatchSize)
+        for k in 0 ..< BatchSize:
+          gts[k] = rng.random_gt(MyFp12)
+
+        # Reference: one conversion per element
+        var reference = newSeq[T2Aff[F6]](BatchSize)
+        for k in 0 ..< BatchSize:
+          reference[k].fromGT_vartime(gts[k])
+
+        # Under test: a single batched conversion
+        var batched = newSeq[T2Aff[F6]](BatchSize)
+        batched.batchFromGT_vartime(gts)
+
+        for k in 0 ..< BatchSize:
+          let same = bool(F6(batched[k]) == F6(reference[k]))
+          doAssert same, block:
+            "\niteration " & $k & ":\n" &
+            "  found: " & F6(batched[k]).toHex(indent = 12) & "\n" &
+            "  expected: " & F6(reference[k]).toHex(indent = 12) & "\n"
+
+    test(BN254_Nogami)
+    # test(BN254_Snarks)
+    test(BLS12_381)
+
+  test "Batch conversion: 𝔾ₜ <- T₂(𝔽p6)":
+    # Round trip: random 𝔾ₜ elements are converted to T2Prj one at a time
+    # with `fromGT_vartime`, then `batchFromTorus2_vartime` must give back
+    # the original 𝔾ₜ elements.
+    proc test(Name: static Algebra) =
+      type F6 = Fp6[Name]
+      # This test must pass even if the tower is built as Fp2 -> Fp4 -> Fp12
+      type MyFp12 = QuadraticExt[F6]
+
+      for rep in 0 ..< BatchIters:
+        # Random 𝔾ₜ inputs, also the expected output of the round trip
+        var gts = newSeq[MyFp12](BatchSize)
+        for k in 0 ..< BatchSize:
+          gts[k] = rng.random_gt(MyFp12)
+
+        # Torus representation of each input
+        var tori = newSeq[T2Prj[F6]](BatchSize)
+        for k in 0 ..< BatchSize:
+          tori[k].fromGT_vartime(gts[k])
+
+        # Under test: a single batched conversion back to 𝔾ₜ
+        var recovered = newSeq[MyFp12](BatchSize)
+        recovered.batchFromTorus2_vartime(tori)
+
+        for k in 0 ..< BatchSize:
+          let same = bool(gts[k] == recovered[k])
+          doAssert same, block:
+            "\niteration " & $k & ":\n" &
+            "  found: " & recovered[k].toHex(indent = 12) & "\n" &
+            "  expected: " & gts[k].toHex(indent = 12) & "\n"
+
+    test(BN254_Nogami)
+    # test(BN254_Snarks)
+    test(BLS12_381)