feat(gt-multiexp): add baseline multi-exponentiation on 𝔾ₜ based on BDLO12
mratsim · Jul 16, 2024 · 8fb61c6 · 8fb61c6
1 parent 4f31dcb
commit 8fb61c6
Show file tree

Hide file tree

Showing 20 changed files with 563 additions and 40 deletions.
diff --git a/benchmarks/bench_ec_msm_bandersnatch.nimcfg b/benchmarks/bench_ec_msm_bandersnatch.nimcfg
diff --git a/benchmarks/bench_ec_msm_bls12_381_g1.nim.cfg b/benchmarks/bench_ec_msm_bls12_381_g1.nim.cfg
diff --git a/benchmarks/bench_ec_msm_bls12_381_g2.nim.cfg b/benchmarks/bench_ec_msm_bls12_381_g2.nim.cfg
diff --git a/benchmarks/bench_ec_msm_bn254_snarks_g1.nim.cfg b/benchmarks/bench_ec_msm_bn254_snarks_g1.nim.cfg
diff --git a/benchmarks/bench_ec_msm_pasta.nim.cfg b/benchmarks/bench_ec_msm_pasta.nim.cfg
diff --git a/benchmarks/bench_elliptic_parallel_template.nim b/benchmarks/bench_elliptic_parallel_template.nim
@@ -115,15 +115,15 @@ proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters:
   var startNaive, stopNaive, startMSMbaseline, stopMSMbaseline, startMSMopt, stopMSMopt, startMSMpara, stopMSMpara: MonoTime
 
   if numInputs <= 100000:
-    startNaive = getMonotime()
+    # startNaive = getMonotime()
     bench("EC scalar muls                " & align($numInputs, 10) & " (" & $bits & "-bit coefs, points)", EC, iters):
       var tmp: EC
       r.setNeutral()
       for i in 0 ..< points.len:
         tmp.fromAffine(points[i])
         tmp.scalarMul(coefs[i])
         r += tmp
-    stopNaive = getMonotime()
+    # stopNaive = getMonotime()
 
   if numInputs <= 100000:
     startNaive = getMonotime()

diff --git a/benchmarks/bench_ec_g1_batch.nim.cfg → .../bench_elliptic_parallel_template.nim.cfg b/benchmarks/bench_ec_g1_batch.nim.cfg → .../bench_elliptic_parallel_template.nim.cfg
diff --git a/benchmarks/bench_gt.nim b/benchmarks/bench_gt.nim
@@ -33,7 +33,6 @@ proc main() =
   separator()
   staticFor i, 0, AvailableCurves.len:
     const curve = AvailableCurves[i]
-    const bits = Fr[curve].bits()
     separator()
     mulBench(Fp12[curve], Iters)
     sqrBench(Fp12[curve], Iters)

diff --git a/benchmarks/bench_gt_multiexp_bls12_381.nim b/benchmarks/bench_gt_multiexp_bls12_381.nim
@@ -0,0 +1,46 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internals
+  constantine/named/algebras,
+  constantine/math/extension_fields,
+  # Helpers
+  ./bench_gt_parallel_template
+
+# ############################################################
+#
+#               Benchmark of the 𝔾ₜ group of
+#                  Pairing Friendly curves
+#
+# ############################################################
+
+const Iters = 10000  # iteration budget; main() divides it by the batch size
+const AvailableCurves = [
+  # Commented-out curves are not benchmarked by this file:
+  # BN254_Nogami,
+  # BN254_Snarks,
+  # BLS12_377,
+  BLS12_381,
+]
+
+const testNumPoints = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]  # batch sizes benchmarked by main()
+
+proc main() =  # run the multi-exponentiation benchmark for every curve and batch size
+  separator()
+  staticFor i, 0, AvailableCurves.len:
+    const curve = AvailableCurves[i]
+    var ctx = createBenchMultiExpContext(Fp12[curve], testNumPoints)  # inputs are generated once per curve, sized for the largest batch
+    separator()
+    for numPoints in testNumPoints:
+      let batchIters = max(1, Iters div numPoints)  # fewer iterations for larger batches, never fewer than one
+      ctx.multiExpParallelBench(numPoints, batchIters)
+      separator()
+    separator()
+
+main()
+notes()
diff --git a/benchmarks/bench_gt_multiexp_bls12_381.nim.cfg b/benchmarks/bench_gt_multiexp_bls12_381.nim.cfg
@@ -0,0 +1 @@
+--threads:on
diff --git a/benchmarks/bench_gt_parallel_template.nim b/benchmarks/bench_gt_parallel_template.nim
@@ -0,0 +1,162 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#        Benchmark template for 𝔾ₜ multi-exponentiation
+#
+# ############################################################
+
+import
+  # Standard library
+  std/[monotimes, times],
+  # Internals
+  constantine/platforms/abstractions,
+  constantine/named/algebras,
+  constantine/math/[arithmetic, extension_fields],
+  constantine/math/pairings/[
+    pairings_generic,
+    gt_exponentiations,
+    gt_exponentiations_vartime,
+    gt_multiexp
+  ],
+  constantine/threadpool,
+  # Helpers
+  helpers/prng_unsafe,
+  ./bench_blueprint
+
+export times, monotimes
+export notes
+export abstractions
+proc separator*() = separator(168)
+
+proc report(op, domain: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =  # print one result row, averaged over `iters`
+  let ns = inNanoseconds((stop-start) div iters)  # mean elapsed time per iteration, in nanoseconds
+  let throughput = 1e9 / float64(ns)  # iterations per second
+  when SupportsGetTicks:  # the CPU-cycle column is only printed when SupportsGetTicks holds
+    echo &"{op:<68} {domain:<20} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
+  else:
+    echo &"{op:<68} {domain:<20} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
+
+macro fixFieldDisplay(T: typedesc): untyped =
+  # At compile-time, enums are integers and their display is buggy:
+  # we get the Curve ID instead of the curve name, so the name is rebuilt here.
+  let instantiated = T.getTypeInst()
+  var name = $instantiated[1][0] # name of the field type, e.g. Fp
+  name.add "[" & $Algebra(instantiated[1][1].intVal) & "]"  # turn the integer back into its Algebra enum name
+  result = newLit name  # the macro expands to a string literal
+
+func fixDisplay(T: typedesc): string =  # display name of a type for the report's domain column
+  when T is (Fp or Fp2 or Fp4 or Fp6 or Fp12):  # field types go through the macro to show the curve name
+    fixFieldDisplay(T)
+  else:
+    $T
+
+func fixDisplay(T: Algebra): string =  # Algebra values are stringified directly
+  $T
+
+template bench(op: string, T: typed, iters: int, body: untyped): untyped =  # time `body`, then print one report row
+  measure(iters, startTime, stopTime, startClk, stopClk, body)  # NOTE(review): measure is defined elsewhere; presumably it runs body `iters` times and declares the four start/stop symbols — confirm
+  report(op, fixDisplay(T), startTime, stopTime, startClk, stopClk, iters)
+
+func random_gt*(rng: var RngState, F: typedesc): F {.inline, noInit.} =
+  result = rng.random_unsafe(F)  # start from a random element of F
+  result.finalExp()  # NOTE(review): presumably the final exponentiation maps it into the 𝔾ₜ subgroup — confirm
+
+# Multi-exponentiations
+# ---------------------------------------------------------------------------
+
+type BenchMultiexpContext*[GT] = object  # pre-generated inputs for the multi-exponentiation benchmarks
+  tp: Threadpool  # pool used by createBenchMultiExpContext to generate the inputs
+  numInputs: int  # number of generated (elem, exponent) pairs
+  elems: seq[GT]
+  exponents: seq[getBigInt(GT.Name, kScalarField)]
+
+proc createBenchMultiExpContext*(GT: typedesc, inputSizes: openArray[int]): BenchMultiexpContext[GT] =  # generate random benchmark inputs in parallel
+  result.tp = Threadpool.new()
+  let maxNumInputs = inputSizes.max()  # generate enough pairs for the largest requested size
+
+  const bits = Fr[GT.Name].bits()
+
+  result.numInputs = maxNumInputs
+  result.elems = newSeq[GT](maxNumInputs)
+  result.exponents = newSeq[BigInt[bits]](maxNumInputs)
+
+  proc genElemExponentPairsChunk[GT](rngSeed: uint64, start, len: int, elems: ptr GT, exponents: ptr BigInt[bits]) {.nimcall.} =
+    let elems = cast[ptr UncheckedArray[GT]](elems)  # view the raw pointers as arrays so they can be indexed
+    let exponents = cast[ptr UncheckedArray[BigInt[bits]]](exponents)
+
+    # RNGs are not threadsafe, create a threadlocal one seeded with the value passed by the spawner
+    var threadRng: RngState
+    threadRng.seed(rngSeed)
+
+    for i in start ..< start + len:  # each task fills only its own slice [start, start+len)
+      elems[i] = threadRng.random_gt(GT)
+      exponents[i] = threadRng.random_unsafe(BigInt[bits])
+
+  let chunks = balancedChunksPrioNumber(0, maxNumInputs, result.tp.numThreads)
+
+  stdout.write &"Generating {maxNumInputs} (elems, exponents) pairs ... "
+  stdout.flushFile()
+
+  let start = getMonotime()
+
+  syncScope:
+    for (id, start, size) in items(chunks):  # this `start` shadows the timestamp above, inside the loop only
+      result.tp.spawn genElemExponentPairsChunk(rng.next(), start, size, result.elems[0].addr, result.exponents[0].addr)
+
+  # Even if child threads are sleeping, it seems like perf is lower when there are threads around
+  # maybe because the kernel has more overhead or time quantum to keep track of, so shut them down.
+  result.tp.shutdown()
+
+  let stop = getMonotime()  # taken after shutdown, so the printed time includes it
+  stdout.write &"in {float64(inNanoSeconds(stop-start)) / 1e6:6.3f} ms\n"
+
+proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: int, iters: int) =
+  const bits = Fr[GT.Name].bits()
+
+  template elems: untyped = ctx.elems.toOpenArray(0, numInputs-1)  # first numInputs pre-generated elements
+  template exponents: untyped = ctx.exponents.toOpenArray(0, numInputs-1)
+
+  # Times a gtExp loop, a gtExp_vartime loop and multiExp_reference_vartime on the same inputs.
+  var r{.noInit.}: GT
+  var startNaive, stopNaive, startMultiExpBaseline, stopMultiExpBaseline: MonoTime
+
+  if numInputs <= 100000:
+    # startNaive = getMonotime()  (disabled: the vartime loop below is the "naive" reference)
+    bench("𝔾ₜ exponentiations                " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
+      var tmp: GT
+      r.setOne()
+      for i in 0 ..< elems.len:
+        tmp.gtExp(elems[i], exponents[i])
+        r *= tmp
+    # stopNaive = getMonotime()  (disabled, see above)
+
+  if numInputs <= 100000:
+    startNaive = getMonotime()  # brackets the whole bench call, including the printing of its report row
+    bench("𝔾ₜ exponentiations vartime        " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
+      var tmp: GT
+      r.setOne()
+      for i in 0 ..< elems.len:
+        tmp.gtExp_vartime(elems[i], exponents[i])
+        r *= tmp
+    stopNaive = getMonotime()
+
+  if numInputs <= 100000:
+    startMultiExpBaseline = getMonotime()
+    bench("𝔾ₜ multi-exponentiations baseline " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
+      r.multiExp_reference_vartime(elems, exponents)
+    stopMultiExpBaseline = getMonotime()
+
+  # Computed unconditionally; only used inside the size guard below.
+  let perfNaive = inNanoseconds((stopNaive-startNaive) div iters)
+  let perfMSMbaseline = inNanoseconds((stopMultiExpBaseline-startMultiExpBaseline) div iters)
+
+  if numInputs <= 100000:
+    let speedupBaseline = float(perfNaive) / float(perfMSMbaseline)  # above 1 means multiExp_reference_vartime beat the vartime loop
+    echo &"Speedup ratio baseline over naive linear combination: {speedupBaseline:>6.3f}x"
diff --git a/benchmarks/bench_gt_template.nim b/benchmarks/bench_gt_template.nim
@@ -27,7 +27,6 @@ import
   helpers/prng_unsafe,
   ./bench_blueprint
 
-
 export notes
 export abstractions
 proc separator*() = separator(168)

diff --git a/constantine.nimble b/constantine.nimble
@@ -654,6 +654,7 @@ const benchDesc = [
   "bench_pairing_bn254_nogami",
   "bench_pairing_bn254_snarks",
   "bench_gt",
+  "bench_gt_multiexp_bls12_381",
   "bench_summary_bls12_377",
   "bench_summary_bls12_381",
   "bench_summary_bn254_nogami",
@@ -1036,6 +1037,12 @@ task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Multi
 task bench_gt, "Run 𝔾ₜ benchmarks - CC compiler":
   runBench("bench_gt")
 
+# 𝔾ₜ - multi-exponentiation
+# ------------------------------------------
+
+task bench_gt_multiexp_bls12_381, "Run 𝔾ₜ multiexponentiation benchmarks for BLS12-381 - CC compiler":
+  runBench("bench_gt_multiexp_bls12_381")
+
 # Pairings
 # ------------------------------------------
 

diff --git a/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim b/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim
@@ -213,6 +213,7 @@ func bestBucketBitSize*(inputSize: int, scalarBitwidth: static int, useSignedBuc
   # L1, L2 caches, TLB and 64 aliasing conflict
   # are not taken into account in previous formula.
   # Each increase in c doubles memory used.
+  # TODO: use the element size for thresholds.
   when useManualTuning:
     if 14 <= result:
       result -= 1

diff --git a/constantine/math/pairings/cyclotomic_subgroups.nim b/constantine/math/pairings/cyclotomic_subgroups.nim
@@ -225,9 +225,9 @@ func cyclotomic_square_cube_over_quad(r: var CubicExt, a: CubicExt) =
   # https://eprint.iacr.org/2009/565.pdf
 
   # Cubic extension field
-  # A = 3a² − 2 ̄a
-  # B = 3 √i c² + 2 ̄b
-  # C = 3b² − 2 ̄c
+  # A = 3a² − 2a̅
+  # B = 3 √i c² + 2b̅
+  # C = 3b² − 2c̅
   var v0{.noInit.}, v1{.noInit.}, v2{.noInit.}: typeof(a.c0)
 
   template a0: untyped = a.c0.c0
@@ -261,7 +261,7 @@ func cyclotomic_square_cube_over_quad(r: var CubicExt, a: CubicExt) =
   r.c2.c1.double()
   r.c2.c1 += v1.c1
 
-  # Now B = 3 √i c² + 2 ̄b
+  # Now B = 3 √i c² + 2b̅
   # beware of mul by non residue: √i v₂ = ξv₂₁ + v₂₀√i
 
   # 3 (√i c²)₀ + 2a₂
@@ -291,9 +291,9 @@ func cyclotomic_square_quad_over_cube[F](r: var QuadraticExt[F], a: QuadraticExt
   #    c₅     <=>        a₅            <=>            b₅
   #
   # Hence, this formula for a cubic extension field
-  #   A = 3a² − 2 ̄a
-  #   B = 3 √i c² + 2 ̄b
-  #   C = 3b² − 2 ̄c
+  #   A = 3a² − 2a̅
+  #   B = 3 √i c² + 2b̅
+  #   C = 3b² − 2c̅
   #
   # becomes
   #   A = (b₀, b₄) = 3(b₀, b₄)² - 2(b₀,-b₄)