From 92685022c43c9d113126d0bfcb130a773e2c4216 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy <mamy_github@numforge.co>
Date: Fri, 19 Jul 2024 07:32:41 +0000
Subject: [PATCH] =?UTF-8?q?=F0=9D=94=BE=E2=82=9C=20multi-exponentiations?=
 =?UTF-8?q?=20(#436)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(gt-multiexp): add baseline multi-exponentiation on 𝔾ₜ based on BDLO12

* feat(gt-multiexp): add optimized multi-exponentiation in 𝔾ₜ

* feat(gt-multiexp): add parallel multi-exponentiation in 𝔾ₜ

* feat(gt-multiexp): enable parallel tests in nimble

* magic workaround for https://github.com/nim-lang/Nim/issues/23853
---
 ...mcfg => bench_ec_msm_bandersnatch.nim.cfg} |   0
 .../bench_elliptic_parallel_template.nim      |   4 +-
 benchmarks/bench_gt.nim                       |   1 -
 benchmarks/bench_gt_multiexp_bls12_381.nim    |  46 ++
 .../bench_gt_multiexp_bls12_381.nim.cfg       |   1 +
 benchmarks/bench_gt_parallel_template.nim     | 189 +++++++
 benchmarks/bench_gt_template.nim              |   1 -
 constantine.nimble                            |   9 +
 .../math/elliptic/ec_multi_scalar_mul.nim     |  89 ++--
 .../elliptic/ec_multi_scalar_mul_parallel.nim |  58 +--
 .../ec_multi_scalar_mul_scheduler.nim         |   1 +
 .../math/pairings/cyclotomic_subgroups.nim    |  14 +-
 constantine/math/pairings/gt_multiexp.nim     | 489 ++++++++++++++++++
 .../math/pairings/gt_multiexp_parallel.nim    | 258 +++++++++
 .../math/pairings/pairings_generic.nim        |   9 +-
 .../t_pairing_bls12_377_optate.nim            |   9 +-
 .../t_pairing_bls12_381_gt_exp.nim            |   4 +-
 .../t_pairing_bls12_381_gt_multiexp.nim       |  18 +
 .../t_pairing_bls12_381_gt_subgroup.nim       |  10 +-
 .../t_pairing_bls12_381_optate.nim            |   9 +-
 .../t_pairing_bn254_nogami_gt_subgroup.nim    |  10 +-
 .../t_pairing_bn254_nogami_optate.nim         |   9 +-
 .../t_pairing_bn254_snarks_gt_exp.nim         |   4 +-
 .../t_pairing_bn254_snarks_gt_multiexp.nim    |  18 +
 .../t_pairing_bn254_snarks_gt_subgroup.nim    |  10 +-
 .../t_pairing_bn254_snarks_optate.nim         |   9 +-
 .../t_pairing_bw6_761_gt_subgroup.nim         |  10 +-
 .../t_pairing_bw6_761_optate.nim              |   9 +-
 tests/math_pairings/t_pairing_template.nim    | 108 ++--
 tests/parallel/t_ec_template_parallel.nim     |   2 +-
 ...pairing_bls12_381_gt_multiexp_parallel.nim |  18 +
 .../parallel/t_pairing_template_parallel.nim  |  88 ++++
 32 files changed, 1323 insertions(+), 191 deletions(-)
 rename benchmarks/{bench_ec_msm_bandersnatch.nimcfg => bench_ec_msm_bandersnatch.nim.cfg} (100%)
 create mode 100644 benchmarks/bench_gt_multiexp_bls12_381.nim
 create mode 100644 benchmarks/bench_gt_multiexp_bls12_381.nim.cfg
 create mode 100644 benchmarks/bench_gt_parallel_template.nim
 create mode 100644 constantine/math/pairings/gt_multiexp.nim
 create mode 100644 constantine/math/pairings/gt_multiexp_parallel.nim
 create mode 100644 tests/math_pairings/t_pairing_bls12_381_gt_multiexp.nim
 create mode 100644 tests/math_pairings/t_pairing_bn254_snarks_gt_multiexp.nim
 create mode 100644 tests/parallel/t_pairing_bls12_381_gt_multiexp_parallel.nim
 create mode 100644 tests/parallel/t_pairing_template_parallel.nim

diff --git a/benchmarks/bench_ec_msm_bandersnatch.nimcfg b/benchmarks/bench_ec_msm_bandersnatch.nim.cfg
similarity index 100%
rename from benchmarks/bench_ec_msm_bandersnatch.nimcfg
rename to benchmarks/bench_ec_msm_bandersnatch.nim.cfg
diff --git a/benchmarks/bench_elliptic_parallel_template.nim b/benchmarks/bench_elliptic_parallel_template.nim
index afc0a649c..647a9f81b 100644
--- a/benchmarks/bench_elliptic_parallel_template.nim
+++ b/benchmarks/bench_elliptic_parallel_template.nim
@@ -115,7 +115,7 @@ proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters:
   var startNaive, stopNaive, startMSMbaseline, stopMSMbaseline, startMSMopt, stopMSMopt, startMSMpara, stopMSMpara: MonoTime
 
   if numInputs <= 100000:
-    startNaive = getMonotime()
+    # startNaive = getMonotime()
     bench("EC scalar muls                " & align($numInputs, 10) & " (" & $bits & "-bit coefs, points)", EC, iters):
       var tmp: EC
       r.setNeutral()
@@ -123,7 +123,7 @@ proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters:
         tmp.fromAffine(points[i])
         tmp.scalarMul(coefs[i])
         r += tmp
-    stopNaive = getMonotime()
+    # stopNaive = getMonotime()
 
   if numInputs <= 100000:
     startNaive = getMonotime()
diff --git a/benchmarks/bench_gt.nim b/benchmarks/bench_gt.nim
index cc10108ef..a3599266c 100644
--- a/benchmarks/bench_gt.nim
+++ b/benchmarks/bench_gt.nim
@@ -33,7 +33,6 @@ proc main() =
   separator()
   staticFor i, 0, AvailableCurves.len:
     const curve = AvailableCurves[i]
-    const bits = Fr[curve].bits()
     separator()
     mulBench(Fp12[curve], Iters)
     sqrBench(Fp12[curve], Iters)
diff --git a/benchmarks/bench_gt_multiexp_bls12_381.nim b/benchmarks/bench_gt_multiexp_bls12_381.nim
new file mode 100644
index 000000000..ef60000af
--- /dev/null
+++ b/benchmarks/bench_gt_multiexp_bls12_381.nim
@@ -0,0 +1,46 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internals
+  constantine/named/algebras,
+  constantine/math/extension_fields,
+  # Helpers
+  ./bench_gt_parallel_template
+
+# ############################################################
+#
+#               Benchmark of the 𝔾ₜ group of
+#                  Pairing Friendly curves
+#
+# ############################################################
+
+const Iters = 10000
+const AvailableCurves = [
+  # BN254_Nogami,
+  # BN254_Snarks,
+  # BLS12_377,
+  BLS12_381,
+]
+
+const testNumPoints = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+
+proc main() =
+  separator()
+  staticFor i, 0, AvailableCurves.len:
+    const curve = AvailableCurves[i]
+    var ctx = createBenchMultiExpContext(Fp12[curve], testNumPoints)
+    separator()
+    for numPoints in testNumPoints:
+      let batchIters = max(1, Iters div numPoints)
+      ctx.multiExpParallelBench(numPoints, batchIters)
+      separator()
+    separator()
+
+main()
+notes()
diff --git a/benchmarks/bench_gt_multiexp_bls12_381.nim.cfg b/benchmarks/bench_gt_multiexp_bls12_381.nim.cfg
new file mode 100644
index 000000000..aed303eef
--- /dev/null
+++ b/benchmarks/bench_gt_multiexp_bls12_381.nim.cfg
@@ -0,0 +1 @@
+--threads:on
diff --git a/benchmarks/bench_gt_parallel_template.nim b/benchmarks/bench_gt_parallel_template.nim
new file mode 100644
index 000000000..0e8044001
--- /dev/null
+++ b/benchmarks/bench_gt_parallel_template.nim
@@ -0,0 +1,189 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#             Summary of the performance of a curve
+#
+# ############################################################
+
+import
+  # Standard library
+  std/[monotimes, times],
+  # Internals
+  constantine/platforms/abstractions,
+  constantine/named/algebras,
+  constantine/math/[arithmetic, extension_fields],
+  constantine/math/pairings/[
+    pairings_generic,
+    gt_exponentiations,
+    gt_exponentiations_vartime,
+    gt_multiexp, gt_multiexp_parallel,
+  ],
+  constantine/threadpool,
+  # Helpers
+  helpers/prng_unsafe,
+  ./bench_blueprint
+
+export times, monotimes
+export notes
+export abstractions
+proc separator*() = separator(168)
+
+proc report(op, domain: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
+  let ns = inNanoseconds((stop-start) div iters)
+  let throughput = 1e9 / float64(ns)
+  when SupportsGetTicks:
+    echo &"{op:<68} {domain:<20} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
+  else:
+    echo &"{op:<68} {domain:<20} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
+
+macro fixFieldDisplay(T: typedesc): untyped =
+  # At compile-time, enums are integers and their display is buggy
+  # we get the Curve ID instead of the curve name.
+  let instantiated = T.getTypeInst()
+  var name = $instantiated[1][0] # Fp
+  name.add "[" & $Algebra(instantiated[1][1].intVal) & "]"
+  result = newLit name
+
+func fixDisplay(T: typedesc): string =
+  when T is (Fp or Fp2 or Fp4 or Fp6 or Fp12):
+    fixFieldDisplay(T)
+  else:
+    $T
+
+func fixDisplay(T: Algebra): string =
+  $T
+
+template bench(op: string, T: typed, iters: int, body: untyped): untyped =
+  measure(iters, startTime, stopTime, startClk, stopClk, body)
+  report(op, fixDisplay(T), startTime, stopTime, startClk, stopClk, iters)
+
+func random_gt*(rng: var RngState, F: typedesc): F {.inline, noInit.} =
+  result = rng.random_unsafe(F)
+  result.finalExp()
+
+# Multi-exponentiations
+# ---------------------------------------------------------------------------
+
+type BenchMultiexpContext*[GT] = object
+  tp: Threadpool
+  numInputs: int
+  exponents: seq[getBigInt(GT.Name(), kScalarField)]
+  elems: seq[GT]
+
+proc createBenchMultiExpContext*(GT: typedesc, inputSizes: openArray[int]): BenchMultiexpContext[GT] =
+  result.tp = Threadpool.new()
+  let maxNumInputs = inputSizes.max()
+
+  const bits = Fr[GT.Name].bits()
+
+  result.numInputs = maxNumInputs
+  result.elems = newSeq[GT](maxNumInputs)
+  result.exponents = newSeq[BigInt[bits]](maxNumInputs)
+
+  proc genElemExponentPairsChunk[GT](rngSeed: uint64, start, len: int, elems: ptr GT, exponents: ptr BigInt[bits]) {.nimcall.} =
+    let elems = cast[ptr UncheckedArray[GT]](elems)
+    let exponents = cast[ptr UncheckedArray[BigInt[bits]]](exponents)
+
+    # RNGs are not threadsafe, create a threadlocal one seeded from the global RNG
+    var threadRng: RngState
+    threadRng.seed(rngSeed)
+
+    for i in start ..< start + len:
+      elems[i] = threadRng.random_gt(GT)
+      exponents[i] = threadRng.random_unsafe(BigInt[bits])
+
+  let chunks = balancedChunksPrioNumber(0, maxNumInputs, result.tp.numThreads)
+
+  stdout.write &"Generating {maxNumInputs} (elems, exponents) pairs ... "
+  stdout.flushFile()
+
+  let start = getMonotime()
+
+  syncScope:
+    for (id, start, size) in items(chunks):
+      result.tp.spawn genElemExponentPairsChunk(rng.next(), start, size, result.elems[0].addr, result.exponents[0].addr)
+
+  # Even if child threads are sleeping, it seems like perf is lower when there are threads around
+  # maybe because the kernel has more overhead or time quantum to keep track off so shut them down.
+  result.tp.shutdown()
+
+  let stop = getMonotime()
+  stdout.write &"in {float64(inNanoSeconds(stop-start)) / 1e6:6.3f} ms\n"
+
+proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: int, iters: int) =
+  const bits = Fr[GT.Name].bits()
+
+  template elems: untyped = ctx.elems.toOpenArray(0, numInputs-1)
+  template exponents: untyped = ctx.exponents.toOpenArray(0, numInputs-1)
+
+
+  var r{.noInit.}: GT
+  var startNaive, stopNaive, startMultiExpBaseline, stopMultiExpBaseline: MonoTime
+  var startMultiExpOpt, stopMultiExpOpt, startMultiExpPara, stopMultiExpPara: MonoTime
+
+  if numInputs <= 100000:
+    # startNaive = getMonotime()
+    bench("𝔾ₜ exponentiations                 " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
+      var tmp: GT
+      r.setOne()
+      for i in 0 ..< elems.len:
+        tmp.gtExp(elems[i], exponents[i])
+        r *= tmp
+    # stopNaive = getMonotime()
+
+  if numInputs <= 100000:
+    startNaive = getMonotime()
+    bench("𝔾ₜ exponentiations vartime         " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
+      var tmp: GT
+      r.setOne()
+      for i in 0 ..< elems.len:
+        tmp.gtExp_vartime(elems[i], exponents[i])
+        r *= tmp
+    stopNaive = getMonotime()
+
+  if numInputs <= 100000:
+    startMultiExpBaseline = getMonotime()
+    bench("𝔾ₜ multi-exponentiations baseline  " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
+      r.multiExp_reference_vartime(elems, exponents)
+    stopMultiExpBaseline = getMonotime()
+
+  block:
+    startMultiExpOpt = getMonotime()
+    bench("𝔾ₜ multi-exponentiations optimized " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
+      r.multiExp_vartime(elems, exponents)
+    stopMultiExpOpt = getMonotime()
+
+  block:
+    ctx.tp = Threadpool.new()
+
+    startMultiExpPara = getMonotime()
+    bench("𝔾ₜ multi-exponentiations" & align($ctx.tp.numThreads & " threads", 11) & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
+      ctx.tp.multiExp_vartime_parallel(r, elems, exponents)
+    stopMultiExpPara = getMonotime()
+
+    ctx.tp.shutdown()
+
+  let perfNaive = inNanoseconds((stopNaive-startNaive) div iters)
+  let perfMultiExpBaseline = inNanoseconds((stopMultiExpBaseline-startMultiExpBaseline) div iters)
+  let perfMultiExpOpt = inNanoseconds((stopMultiExpOpt-startMultiExpOpt) div iters)
+  let perfMultiExpPara = inNanoseconds((stopMultiExpPara-startMultiExpPara) div iters)
+
+  if numInputs <= 100000:
+    let speedupBaseline = float(perfNaive) / float(perfMultiExpBaseline)
+    echo &"Speedup ratio baseline over naive linear combination: {speedupBaseline:>6.3f}x"
+
+    let speedupOpt = float(perfNaive) / float(perfMultiExpOpt)
+    echo &"Speedup ratio optimized over naive linear combination: {speedupOpt:>6.3f}x"
+
+    let speedupOptBaseline = float(perfMultiExpBaseline) / float(perfMultiExpOpt)
+    echo &"Speedup ratio optimized over baseline linear combination: {speedupOptBaseline:>6.3f}x"
+
+  let speedupParaOpt = float(perfMultiExpOpt) / float(perfMultiExpPara)
+  echo &"Speedup ratio parallel over optimized linear combination: {speedupParaOpt:>6.3f}x"
diff --git a/benchmarks/bench_gt_template.nim b/benchmarks/bench_gt_template.nim
index 4e41b5f1f..a4d8d9e0f 100644
--- a/benchmarks/bench_gt_template.nim
+++ b/benchmarks/bench_gt_template.nim
@@ -27,7 +27,6 @@ import
   helpers/prng_unsafe,
   ./bench_blueprint
 
-
 export notes
 export abstractions
 proc separator*() = separator(168)
diff --git a/constantine.nimble b/constantine.nimble
index 93c3a02d6..89ed69a04 100644
--- a/constantine.nimble
+++ b/constantine.nimble
@@ -637,6 +637,8 @@ const testDescMultithreadedCrypto: seq[string] = @[
   "tests/parallel/t_ec_shortw_prj_g1_batch_add_parallel.nim",
   "tests/parallel/t_ec_shortw_jac_g1_msm_parallel.nim",
   "tests/parallel/t_ec_shortw_prj_g1_msm_parallel.nim",
+  "tests/parallel/t_ec_twedwards_prj_msm_parallel.nim",
+  "tests/parallel/t_pairing_bls12_381_gt_multiexp_parallel.nim",
 ]
 
 const benchDesc = [
@@ -661,6 +663,7 @@ const benchDesc = [
   "bench_pairing_bn254_nogami",
   "bench_pairing_bn254_snarks",
   "bench_gt",
+  "bench_gt_multiexp_bls12_381",
   "bench_summary_bls12_377",
   "bench_summary_bls12_381",
   "bench_summary_bn254_nogami",
@@ -1043,6 +1046,12 @@ task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Multi
 task bench_gt, "Run 𝔾ₜ benchmarks - CC compiler":
   runBench("bench_gt")
 
+# 𝔾ₜ - multi-exponentiation
+# ------------------------------------------
+
+task bench_gt_multiexp_bls12_381, "Run 𝔾ₜ multiexponentiation benchmarks for BLS12-381 - CC compiler":
+  runBench("bench_gt_multiexp_bls12_381")
+
 # Pairings
 # ------------------------------------------
 
diff --git a/constantine/math/elliptic/ec_multi_scalar_mul.nim b/constantine/math/elliptic/ec_multi_scalar_mul.nim
index 51f9e9573..25c1fc754 100644
--- a/constantine/math/elliptic/ec_multi_scalar_mul.nim
+++ b/constantine/math/elliptic/ec_multi_scalar_mul.nim
@@ -253,7 +253,7 @@ func miniMSM[bits: static int, EC, ECaff](
     for _ in 0 ..< c:
       r.double()
 
-func multiScalarMul_vartime*[bits: static int, EC, ECaff](
+func msmImpl_vartime[bits: static int, EC, ECaff](
        r: var EC,
        coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff],
        N: int, c: static int) {.tags:[VarTime, HeapAlloc], meter.} =
@@ -295,6 +295,9 @@ func multiScalarMul_vartime*[bits: static int, EC, ECaff](
   # -------
   buckets.freeHeap()
 
+# Multi scalar multiplication with batched affine additions
+# -----------------------------------------------------------------------------------------------------------------------
+
 func schedAccumulate*[NumBuckets, QueueLen, F, G; bits: static int](
        sched: ptr Scheduler[NumBuckets, QueueLen, F, G],
        bitIndex: int, miniMsmKind: static MiniMsmKind, c: static int,
@@ -344,7 +347,7 @@ func miniMSM_affine[NumBuckets, QueueLen, EC, ECaff; bits: static int](
     for _ in 0 ..< c:
       r.double()
 
-func multiScalarMulAffine_vartime[bits: static int, EC, ECaff](
+func msmAffineImpl_vartime[bits: static int, EC, ECaff](
        r: var EC,
        coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff],
        N: int, c: static int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
@@ -389,6 +392,9 @@ func multiScalarMulAffine_vartime[bits: static int, EC, ECaff](
   sched.freeHeap()
   buckets.freeHeap()
 
+# Endomorphism acceleration
+# -----------------------------------------------------------------------------------------------------------------------
+
 proc applyEndomorphism[bits: static int, ECaff](
        coefs: ptr UncheckedArray[BigInt[bits]],
        points: ptr UncheckedArray[ECaff],
@@ -403,7 +409,7 @@ proc applyEndomorphism[bits: static int, ECaff](
   const G = when ECaff isnot EC_ShortW_Aff: G1
             else: ECaff.G
 
-  const L = ECaff.getScalarField().bits().ceilDiv_vartime(M) + 1
+  const L = ECaff.getScalarField().bits().computeEndoRecodedLength(M)
   let splitCoefs   = allocHeapArray(array[M, BigInt[L]], N)
   let endoBasis    = allocHeapArray(array[M, ECaff], N)
 
@@ -447,7 +453,10 @@ template withEndo[coefsBits: static int, EC, ECaff](
   else:
     msmProc(r, coefs, points, N, c)
 
-func multiScalarMul_dispatch_vartime[bits: static int, F, G](
+# Algorithm selection
+# -----------------------------------------------------------------------------------------------------------------------
+
+func msm_dispatch_vartime[bits: static int, F, G](
        r: var (EC_ShortW_Jac[F, G] or EC_ShortW_Prj[F, G]),
        coefs: ptr UncheckedArray[BigInt[bits]],
        points: ptr UncheckedArray[EC_ShortW_Aff[F, G]], N: int) =
@@ -460,27 +469,27 @@ func multiScalarMul_dispatch_vartime[bits: static int, F, G](
   # but it has no significant impact on performance
 
   case c
-  of  2: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  2)
-  of  3: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  3)
-  of  4: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  4)
-  of  5: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  5)
-  of  6: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  6)
-  of  7: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  7)
-  of  8: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  8)
-
-  of  9: withEndo(multiScalarMulAffine_vartime, r, coefs, points, N, c =  9)
-  of 10: withEndo(multiScalarMulAffine_vartime, r, coefs, points, N, c = 10)
-  of 11: withEndo(multiScalarMulAffine_vartime, r, coefs, points, N, c = 11)
-  of 12: withEndo(multiScalarMulAffine_vartime, r, coefs, points, N, c = 12)
-  of 13: withEndo(multiScalarMulAffine_vartime, r, coefs, points, N, c = 13)
-  of 14: multiScalarMulAffine_vartime(r, coefs, points, N, c = 14)
-  of 15: multiScalarMulAffine_vartime(r, coefs, points, N, c = 15)
-
-  of 16..17: multiScalarMulAffine_vartime(r, coefs, points, N, c = 16)
+  of  2: withEndo(msmImpl_vartime, r, coefs, points, N, c =  2)
+  of  3: withEndo(msmImpl_vartime, r, coefs, points, N, c =  3)
+  of  4: withEndo(msmImpl_vartime, r, coefs, points, N, c =  4)
+  of  5: withEndo(msmImpl_vartime, r, coefs, points, N, c =  5)
+  of  6: withEndo(msmImpl_vartime, r, coefs, points, N, c =  6)
+  of  7: withEndo(msmImpl_vartime, r, coefs, points, N, c =  7)
+  of  8: withEndo(msmImpl_vartime, r, coefs, points, N, c =  8)
+
+  of  9: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c =  9)
+  of 10: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 10)
+  of 11: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 11)
+  of 12: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 12)
+  of 13: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 13)
+  of 14: msmAffineImpl_vartime(r, coefs, points, N, c = 14)
+  of 15: msmAffineImpl_vartime(r, coefs, points, N, c = 15)
+
+  of 16..17: msmAffineImpl_vartime(r, coefs, points, N, c = 16)
   else:
     unreachable()
 
-func multiScalarMul_dispatch_vartime[bits: static int, F](
+func msm_dispatch_vartime[bits: static int, F](
        r: var EC_TwEdw_Prj[F], coefs: ptr UncheckedArray[BigInt[bits]],
        points: ptr UncheckedArray[EC_TwEdw_Aff[F]], N: int) =
   ## Multiscalar multiplication:
@@ -494,22 +503,22 @@ func multiScalarMul_dispatch_vartime[bits: static int, F](
   # but it has no significant impact on performance
 
   case c
-  of  2: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  2)
-  of  3: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  3)
-  of  4: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  4)
-  of  5: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  5)
-  of  6: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  6)
-  of  7: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  7)
-  of  8: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  8)
-  of  9: withEndo(multiScalarMul_vartime, r, coefs, points, N, c =  9)
-  of 10: withEndo(multiScalarMul_vartime, r, coefs, points, N, c = 10)
-  of 11: withEndo(multiScalarMul_vartime, r, coefs, points, N, c = 11)
-  of 12: withEndo(multiScalarMul_vartime, r, coefs, points, N, c = 12)
-  of 13: withEndo(multiScalarMul_vartime, r, coefs, points, N, c = 13)
-  of 14: multiScalarMul_vartime(r, coefs, points, N, c = 14)
-  of 15: multiScalarMul_vartime(r, coefs, points, N, c = 15)
-
-  of 16..17: multiScalarMul_vartime(r, coefs, points, N, c = 16)
+  of  2: withEndo(msmImpl_vartime, r, coefs, points, N, c =  2)
+  of  3: withEndo(msmImpl_vartime, r, coefs, points, N, c =  3)
+  of  4: withEndo(msmImpl_vartime, r, coefs, points, N, c =  4)
+  of  5: withEndo(msmImpl_vartime, r, coefs, points, N, c =  5)
+  of  6: withEndo(msmImpl_vartime, r, coefs, points, N, c =  6)
+  of  7: withEndo(msmImpl_vartime, r, coefs, points, N, c =  7)
+  of  8: withEndo(msmImpl_vartime, r, coefs, points, N, c =  8)
+  of  9: withEndo(msmImpl_vartime, r, coefs, points, N, c =  9)
+  of 10: withEndo(msmImpl_vartime, r, coefs, points, N, c = 10)
+  of 11: withEndo(msmImpl_vartime, r, coefs, points, N, c = 11)
+  of 12: withEndo(msmImpl_vartime, r, coefs, points, N, c = 12)
+  of 13: withEndo(msmImpl_vartime, r, coefs, points, N, c = 13)
+  of 14: msmImpl_vartime(r, coefs, points, N, c = 14)
+  of 15: msmImpl_vartime(r, coefs, points, N, c = 15)
+
+  of 16..17: msmImpl_vartime(r, coefs, points, N, c = 16)
   else:
     unreachable()
 
@@ -521,7 +530,7 @@ func multiScalarMul_vartime*[bits: static int, EC, ECaff](
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁
 
-  multiScalarMul_dispatch_vartime(r, coefs, points, len)
+  msm_dispatch_vartime(r, coefs, points, len)
 
 func multiScalarMul_vartime*[bits: static int, EC, ECaff](
        r: var EC,
@@ -531,7 +540,7 @@ func multiScalarMul_vartime*[bits: static int, EC, ECaff](
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁
   debug: doAssert coefs.len == points.len
   let N = points.len
-  multiScalarMul_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N)
+  msm_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N)
 
 func multiScalarMul_vartime*[F, EC, ECaff](
        r: var EC,
diff --git a/constantine/math/elliptic/ec_multi_scalar_mul_parallel.nim b/constantine/math/elliptic/ec_multi_scalar_mul_parallel.nim
index be5a5c327..0f6d6111a 100644
--- a/constantine/math/elliptic/ec_multi_scalar_mul_parallel.nim
+++ b/constantine/math/elliptic/ec_multi_scalar_mul_parallel.nim
@@ -12,7 +12,7 @@ import constantine/named/algebras,
        constantine/math/endomorphisms/split_scalars,
        constantine/math/extension_fields,
        constantine/named/zoo_endomorphisms,
-       ../../threadpool/[threadpool, partitioners]
+       constantine/threadpool/[threadpool, partitioners]
 export bestBucketBitSize
 
 # No exceptions allowed in core cryptographic operations
@@ -145,7 +145,7 @@ proc bucketAccumReduce_withInit[bits: static int, EC, ECaff](
     buckets[i].setNeutral()
   bucketAccumReduce(windowSum[], buckets, bitIndex, miniMsmKind, c, coefs, points, N)
 
-proc msm_vartime_parallel[bits: static int, EC, ECaff](
+proc msmImpl_vartime_parallel[bits: static int, EC, ECaff](
        tp: Threadpool,
        r: ptr EC,
        coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[EC_aff],
@@ -465,7 +465,7 @@ proc applyEndomorphism_parallel[bits: static int, ECaff](
   const G = when ECaff isnot EC_ShortW_Aff: G1
             else: ECaff.G
 
-  const L = ECaff.getScalarField().bits().ceilDiv_vartime(M) + 1
+  const L = ECaff.getScalarField().bits().computeEndoRecodedLength(M)
   let splitCoefs   = allocHeapArray(array[M, BigInt[L]], N)
   let endoBasis    = allocHeapArray(array[M, ECaff], N)
 
@@ -544,14 +544,14 @@ proc multiScalarMul_dispatch_vartime_parallel[bits: static int, F, G](
   # but it has no significant impact on performance
 
   case c
-  of  2: withEndo(msm_vartime_parallel, tp, r, coefs, points, N, c =  2)
-  of  3: withEndo(msm_vartime_parallel, tp, r, coefs, points, N, c =  3)
-  of  4: withEndo(msm_vartime_parallel, tp, r, coefs, points, N, c =  4)
-  of  5: withEndo(msm_vartime_parallel, tp, r, coefs, points, N, c =  5)
-  of  6: withEndo(msm_vartime_parallel, tp, r, coefs, points, N, c =  6)
+  of  2: withEndo(msmImpl_vartime_parallel, tp, r, coefs, points, N, c =  2)
+  of  3: withEndo(msmImpl_vartime_parallel, tp, r, coefs, points, N, c =  3)
+  of  4: withEndo(msmImpl_vartime_parallel, tp, r, coefs, points, N, c =  4)
+  of  5: withEndo(msmImpl_vartime_parallel, tp, r, coefs, points, N, c =  5)
+  of  6: withEndo(msmImpl_vartime_parallel, tp, r, coefs, points, N, c =  6)
 
-  of  7: msm_vartime_parallel(tp, r, coefs, points, N, c =  7)
-  of  8: msm_vartime_parallel(tp, r, coefs, points, N, c =  8)
+  of  7: msmImpl_vartime_parallel(tp, r, coefs, points, N, c =  7)
+  of  8: msmImpl_vartime_parallel(tp, r, coefs, points, N, c =  8)
 
   of  9: withEndo(msmAffine_vartime_parallel_split, tp, r, coefs, points, N, c =  9, useParallelBuckets = true)
   of 10: withEndo(msmAffine_vartime_parallel_split, tp, r, coefs, points, N, c = 10, useParallelBuckets = true)
@@ -579,23 +579,23 @@ proc multiScalarMul_dispatch_vartime_parallel[bits: static int, F](
   # but it has no significant impact on performance
 
   case c
-  of  2: withEndo(msm_vartime_parallel, tp, r, coefs, points, N, c =  2)
-  of  3: withEndo(msm_vartime_parallel, tp, r, coefs, points, N, c =  3)
-  of  4: withEndo(msm_vartime_parallel, tp, r, coefs, points, N, c =  4)
-  of  5: withEndo(msm_vartime_parallel, tp, r, coefs, points, N, c =  5)
-  of  6: withEndo(msm_vartime_parallel, tp, r, coefs, points, N, c =  6)
-
-  of   7: msm_vartime_parallel(tp, r, coefs, points, N, c =  7)
-  of   8: msm_vartime_parallel(tp, r, coefs, points, N, c =  8)
-  of   9: msm_vartime_parallel(tp, r, coefs, points, N, c =  9)
-  of  10: msm_vartime_parallel(tp, r, coefs, points, N, c = 10)
-  of  11: msm_vartime_parallel(tp, r, coefs, points, N, c = 11)
-  of  12: msm_vartime_parallel(tp, r, coefs, points, N, c = 12)
-  of  13: msm_vartime_parallel(tp, r, coefs, points, N, c = 13)
-  of  14: msm_vartime_parallel(tp, r, coefs, points, N, c = 14)
-  of  15: msm_vartime_parallel(tp, r, coefs, points, N, c = 16)
-
-  of  16..17: msm_vartime_parallel(tp, r, coefs, points, N, c = 16)
+  of  2: withEndo(msmImpl_vartime_parallel, tp, r, coefs, points, N, c =  2)
+  of  3: withEndo(msmImpl_vartime_parallel, tp, r, coefs, points, N, c =  3)
+  of  4: withEndo(msmImpl_vartime_parallel, tp, r, coefs, points, N, c =  4)
+  of  5: withEndo(msmImpl_vartime_parallel, tp, r, coefs, points, N, c =  5)
+  of  6: withEndo(msmImpl_vartime_parallel, tp, r, coefs, points, N, c =  6)
+
+  of   7: msmImpl_vartime_parallel(tp, r, coefs, points, N, c =  7)
+  of   8: msmImpl_vartime_parallel(tp, r, coefs, points, N, c =  8)
+  of   9: msmImpl_vartime_parallel(tp, r, coefs, points, N, c =  9)
+  of  10: msmImpl_vartime_parallel(tp, r, coefs, points, N, c = 10)
+  of  11: msmImpl_vartime_parallel(tp, r, coefs, points, N, c = 11)
+  of  12: msmImpl_vartime_parallel(tp, r, coefs, points, N, c = 12)
+  of  13: msmImpl_vartime_parallel(tp, r, coefs, points, N, c = 13)
+  of  14: msmImpl_vartime_parallel(tp, r, coefs, points, N, c = 14)
+  of  15: msmImpl_vartime_parallel(tp, r, coefs, points, N, c = 16)
+
+  of  16..17: msmImpl_vartime_parallel(tp, r, coefs, points, N, c = 16)
   else:
     unreachable()
 
@@ -620,7 +620,6 @@ proc multiScalarMul_vartime_parallel*[bits: static int, EC, ECaff](
   ## This function can be nested in another parallel function
   debug: doAssert coefs.len == points.len
   let N = points.len
-
   tp.multiScalarMul_dispatch_vartime_parallel(r.addr, coefs.asUnchecked(), points.asUnchecked(), N)
 
 proc multiScalarMul_vartime_parallel*[F, EC, ECaff](
@@ -631,7 +630,6 @@ proc multiScalarMul_vartime_parallel*[F, EC, ECaff](
        len: int) {.meter.} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁
-
   let n = cast[int](len)
   let coefs_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64)
 
@@ -650,8 +648,6 @@ proc multiScalarMul_vartime_parallel*[EC, ECaff](
        points: openArray[ECaff]) {.inline.} =
   ## Multiscalar multiplication:
   ##   r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁
-
   debug: doAssert coefs.len == points.len
   let N = points.len
-
   tp.multiScalarMul_vartime_parallel(r.addr, coefs.asUnchecked(), points.asUnchecked(), N)
diff --git a/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim b/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim
index f18af92ac..043b326b2 100644
--- a/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim
+++ b/constantine/math/elliptic/ec_multi_scalar_mul_scheduler.nim
@@ -213,6 +213,7 @@ func bestBucketBitSize*(inputSize: int, scalarBitwidth: static int, useSignedBuc
   # L1, L2 caches, TLB and 64 aliasing conflict
   # are not taken into account in previous formula.
   # Each increase in c doubles memory used.
+  # TODO: use the element size for thresholds.
   when useManualTuning:
     if 14 <= result:
       result -= 1
diff --git a/constantine/math/pairings/cyclotomic_subgroups.nim b/constantine/math/pairings/cyclotomic_subgroups.nim
index f04714c93..9bb54e401 100644
--- a/constantine/math/pairings/cyclotomic_subgroups.nim
+++ b/constantine/math/pairings/cyclotomic_subgroups.nim
@@ -225,9 +225,9 @@ func cyclotomic_square_cube_over_quad(r: var CubicExt, a: CubicExt) =
   # https://eprint.iacr.org/2009/565.pdf
 
   # Cubic extension field
-  # A = 3a² − 2 ̄a
-  # B = 3 √i c² + 2 ̄b
-  # C = 3b² − 2 ̄c
+  # A = 3a² − 2a̅
+  # B = 3 √i c² + 2b̅
+  # C = 3b² − 2c̅
   var v0{.noInit.}, v1{.noInit.}, v2{.noInit.}: typeof(a.c0)
 
   template a0: untyped = a.c0.c0
@@ -261,7 +261,7 @@ func cyclotomic_square_cube_over_quad(r: var CubicExt, a: CubicExt) =
   r.c2.c1.double()
   r.c2.c1 += v1.c1
 
-  # Now B = 3 √i c² + 2 ̄b
+  # Now B = 3 √i c² + 2b̅
   # beware of mul by non residue: √i v₂ = ξv₂₁ + v₂₀√i
 
   # 3 (√i c²)₀ + 2a₂
@@ -291,9 +291,9 @@ func cyclotomic_square_quad_over_cube[F](r: var QuadraticExt[F], a: QuadraticExt
   #    c₅     <=>        a₅            <=>            b₅
   #
   # Hence, this formula for a cubic extension field
-  #   A = 3a² − 2 ̄a
-  #   B = 3 √i c² + 2 ̄b
-  #   C = 3b² − 2 ̄c
+  #   A = 3a² − 2a̅
+  #   B = 3 √i c² + 2b̅
+  #   C = 3b² − 2c̅
   #
   # becomes
   #   A = (b₀, b₄) = 3(b₀, b₄)² - 2(b₀,-b₄)
diff --git a/constantine/math/pairings/gt_multiexp.nim b/constantine/math/pairings/gt_multiexp.nim
new file mode 100644
index 000000000..0e78fce5f
--- /dev/null
+++ b/constantine/math/pairings/gt_multiexp.nim
@@ -0,0 +1,489 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import constantine/named/algebras,
+       constantine/math/endomorphisms/split_scalars,
+       constantine/math/extension_fields,
+       constantine/math/arithmetic/bigints,
+       constantine/named/zoo_endomorphisms,
+       constantine/platforms/abstractions,
+       ./cyclotomic_subgroups
+
+# No exceptions allowed in core cryptographic operations
+{.push raises: [].}
+{.push checks: off.}
+
+# ########################################################### #
+#                                                             #
+#             Multi-Exponentiation in 𝔾ₜ                      #
+#                                                             #
+# ########################################################### #
+
+# General utilities
+# -------------------------------------------------------------
+
+func bestBucketBitSize(inputSize: int, scalarBitwidth: static int, useSignedBuckets, useManualTuning: static bool): int {.inline.} =
+  ## Evaluate the best bucket bit-size for the input size.
+  ## That bucket size minimize group operations.
+  ## This ignore cache effect. Computation can become memory-bound, especially with large buckets
+  ## that don't fit in L1 cache, trigger the 64K aliasing conflict or worse (overflowing L2 cache or TLB).
+  ## Especially, scalars are expected to be indistinguishable from random so buckets accessed during accumulation
+  ## will be in a random pattern, triggering cache misses.
+
+  # Raw operation cost is approximately
+  # 1. Bucket accumulation
+  #      n - (2ᶜ-1) mul for b/c windows    or n - (2ᶜ⁻¹-1) if using signed buckets
+  # 2. Bucket reduction
+  #      2x(2ᶜ-2) mul for b/c windows      or 2*(2ᶜ⁻¹-2)
+  # 3. Final reduction
+  #      (b/c - 1) x (c cyclotomic squarings + 1 multiplication)
+  # Total
+  #   b/c (n + 2ᶜ - 2) A + (b/c - 1) * (c*D + A)
+  # https://www.youtube.com/watch?v=Bl5mQA7UL2I
+
+  # A cyclotomic square costs ~50% of a 𝔾ₜ multiplication with Granger-Scott formula
+
+  const M = 5300'f32  # Mul cost (in cycles)
+  const S = 2100'f32  # Cyclotomic square cost (in cycles)
+
+  const s = int useSignedBuckets
+  let n = inputSize
+  let b = float32(scalarBitwidth)
+  var minCost = float32(Inf)
+  for c in 2 .. 20: # cap return value at 17 after manual tuning
+    let b_over_c = b/c.float32
+
+    let bucket_accumulate_reduce = b_over_c * float32(n + (1 shl (c-s)) - 2) * M
+    let final_reduction = (b_over_c - 1'f32) * (c.float32*S + M)
+    let cost = bucket_accumulate_reduce + final_reduction
+    if cost < minCost:
+      minCost = cost
+      result = c
+
+  # Manual tuning, memory bandwidth / cache boundaries of
+  # L1, L2 caches, TLB and 64 aliasing conflict
+  # are not taken into account in previous formula.
+  # Each increase in c doubles memory used.
+  # Compared to 𝔾₁, 𝔾ₜ elements are 6x bigger so we shift by 3
+  when useManualTuning:
+    if 11 <= result:
+      result -= 1
+    if 12 <= result:
+      result -= 1
+    if 13 <= result:
+      result -= 1
+
+func `~*=`[Gt: ExtensionField](a: var Gt, b: Gt) {.inline.} =
+  # TODO: Analyze the inputs to see if there is avalue in more complex shortcuts (-1, or partial 0 coordinates)
+  if a.isOne().bool():
+    a = b
+  elif b.isOne().bool():
+    discard
+  else:
+    a *= b
+
+func `~/=`[Gt: ExtensionField](a: var Gt, b: Gt) {.inline.} =
+  ## Cyclotomic division
+  var t {.noInit.}: Gt
+  t.cyclotomic_inv(b)
+  a ~*= t
+
+func setNeutral[Gt: ExtensionField](a: var Gt) {.inline.} =
+  a.setOne()
+
+# Reference multi-exponentiation
+# -------------------------------------------------------------
+
+func multiExpImpl_reference_vartime[bits: static int, Gt](
+       r: var Gt,
+       elems: ptr UncheckedArray[Gt],
+       expos: ptr UncheckedArray[BigInt[bits]],
+       N: int, c: static int) {.tags:[VarTime, HeapAlloc].} =
+  ## Inner implementation of MEXP, for static dispatch over c, the bucket bit length
+  ## This is a straightforward simple translation of BDLO12, section 4
+
+  # Prologue
+  # --------
+  const numBuckets = 1 shl c - 1 # bucket 0 is unused
+  const numWindows = bits.ceilDiv_vartime(c)
+
+  let miniEXPs = allocHeapArray(Gt, numWindows)
+  let buckets = allocHeapArray(Gt, numBuckets)
+
+  # Algorithm
+  # ---------
+  for w in 0 ..< numWindows:
+    # Place our elements in a bucket corresponding to
+    # how many times their bit pattern in the current window of size c
+    for i in 0 ..< numBuckets:
+      buckets[i].setNeutral()
+
+    # 1. Bucket accumulation.                            Cost: n - (2ᶜ-1) => n elems in 2ᶜ-1 elems, first elem per bucket is just copied
+    for j in 0 ..< N:
+      let b = cast[int](expos[j].getWindowAt(w*c, c))
+      if b == 0: # bucket 0 is unused, no need to add aⱼ⁰
+        continue
+      else:
+        buckets[b-1] ~*= elems[j]
+
+    # 2. Bucket reduction.                               Cost: 2x(2ᶜ-2) => 2 additions per 2ᶜ-1 bucket, last bucket is just copied
+    # We have ordered subset sums in each bucket, we now need to compute the mini-exponentiation
+    #   S₁¹ + S₂² + S₃³ + ... + (S₂c₋₁)^(2ᶜ-1)
+    var accumBuckets{.noInit.}, miniEXP{.noInit.}: Gt
+    accumBuckets = buckets[numBuckets-1]
+    miniEXP = buckets[numBuckets-1]
+
+    # Example with c = 3, 2³ = 8
+    for k in countdown(numBuckets-2, 0):
+      accumBuckets ~*= buckets[k] # Stores S₈ then S₈ +S₇ then S₈ +S₇ +S₆ then ...
+      miniEXP ~*= accumBuckets    # Stores S₈ then S₈²+S₇ then S₈³+S₇²+S₆ then ...
+
+    miniEXPs[w] = miniEXP
+
+  # 3. Final reduction.                                  Cost: (b/c - 1)x(c+1) => b/c windows, first is copied, c squarings + 1 mul per window
+  r = miniEXPs[numWindows-1]
+  for w in countdown(numWindows-2, 0):
+    for _ in 0 ..< c:
+      r.cyclotomic_square()
+    r ~*= miniEXPs[w]
+
+  # Cleanup
+  # -------
+  buckets.freeHeap()
+  miniEXPs.freeHeap()
+
+func multiExp_reference_dispatch_vartime[bits: static int, Gt](
+       r: var Gt,
+       elems: ptr UncheckedArray[Gt],
+       expos: ptr UncheckedArray[BigInt[bits]],
+       N: int) {.tags:[VarTime, HeapAlloc].} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  let c = bestBucketBitSize(N, bits, useSignedBuckets = false, useManualTuning = false)
+
+  case c
+  of  2: multiExpImpl_reference_vartime(r, elems, expos, N, c =  2)
+  of  3: multiExpImpl_reference_vartime(r, elems, expos, N, c =  3)
+  of  4: multiExpImpl_reference_vartime(r, elems, expos, N, c =  4)
+  of  5: multiExpImpl_reference_vartime(r, elems, expos, N, c =  5)
+  of  6: multiExpImpl_reference_vartime(r, elems, expos, N, c =  6)
+  of  7: multiExpImpl_reference_vartime(r, elems, expos, N, c =  7)
+  of  8: multiExpImpl_reference_vartime(r, elems, expos, N, c =  8)
+  of  9: multiExpImpl_reference_vartime(r, elems, expos, N, c =  9)
+  of 10: multiExpImpl_reference_vartime(r, elems, expos, N, c = 10)
+  of 11: multiExpImpl_reference_vartime(r, elems, expos, N, c = 11)
+  of 12: multiExpImpl_reference_vartime(r, elems, expos, N, c = 12)
+  of 13: multiExpImpl_reference_vartime(r, elems, expos, N, c = 13)
+  of 14: multiExpImpl_reference_vartime(r, elems, expos, N, c = 14)
+  of 15: multiExpImpl_reference_vartime(r, elems, expos, N, c = 15)
+
+  of 16..20: multiExpImpl_reference_vartime(r, elems, expos, N, c = 16)
+  else:
+    unreachable()
+
+func multiExp_reference_vartime*[bits: static int, Gt](
+       r: var Gt,
+       elems: ptr UncheckedArray[Gt],
+       expos: ptr UncheckedArray[BigInt[bits]],
+       N: int) {.tags:[VarTime, HeapAlloc].} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  multiExp_reference_dispatch_vartime(r, elems, expos, N)
+
+func multiExp_reference_vartime*[Gt](r: var Gt, elems: openArray[Gt], expos: openArray[BigInt]) {.tags:[VarTime, HeapAlloc].} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  debug: doAssert expos.len == elems.len
+  let N = elems.len
+  multiExp_reference_dispatch_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N)
+
+func multiExp_reference_vartime*[F, Gt](
+       r: var Gt,
+       elems: ptr UncheckedArray[Gt],
+       expos: ptr UncheckedArray[F],
+       len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  let n = cast[int](len)
+  let expos_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64)
+  expos_big.batchFromField(expos, n)
+  r.multiExp_reference_vartime(elems, expos_big, n)
+
+  freeHeapAligned(expos_big)
+
+func multiExp_reference_vartime*[Gt](
+       r: var Gt,
+       elems: openArray[Gt],
+       expos: openArray[Fr]) {.tags:[VarTime, Alloca, HeapAlloc], inline.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  debug: doAssert expos.len == elems.len
+  let N = elems.len
+  multiExp_reference_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N)
+
+# ########################################################### #
+#                                                             #
+#                 Multi-exponentiations in 𝔾ₜ                 #
+#                     Optimized version                       #
+#                                                             #
+# ########################################################### #
+
+
+func accumulate[GT](buckets: ptr UncheckedArray[GT], val: SecretWord, negate: SecretBool, elem: GT) {.inline, meter.} =
+  let val = BaseType(val)
+  if val == 0: # Skip g⁰
+    return
+  elif negate.bool:
+    buckets[val-1] ~/= elem
+  else:
+    buckets[val-1] ~*= elem
+
+func bucketReduce[GT](r: var GT, buckets: ptr UncheckedArray[GT], numBuckets: static int) {.meter.} =
+  # We interleave reduction with one-ing the bucket to use instruction-level parallelism
+
+  var accumBuckets{.noInit.}: typeof(r)
+  accumBuckets = buckets[numBuckets-1]
+  r = buckets[numBuckets-1]
+  buckets[numBuckets-1].setNeutral()
+
+  for k in countdown(numBuckets-2, 0):
+    accumBuckets ~*= buckets[k]
+    r ~*= accumBuckets
+    buckets[k].setNeutral()
+
+type MiniMultiExpKind* = enum
+  kTopWindow
+  kFullWindow
+  kBottomWindow
+
+func bucketAccumReduce[bits: static int, GT](
+       r: var GT,
+       buckets: ptr UncheckedArray[GT],
+       bitIndex: int, miniMultiExpKind: static MiniMultiExpKind, c: static int,
+       elems: ptr UncheckedArray[GT], expos: ptr UncheckedArray[BigInt[bits]], N: int) =
+
+  const excess = bits mod c
+  const top = bits - excess
+
+  # 1. Bucket Accumulation
+  var curVal, nextVal: SecretWord
+  var curNeg, nextNeg: SecretBool
+
+  template getSignedWindow(j : int): tuple[val: SecretWord, neg: SecretBool] =
+    when miniMultiExpKind == kBottomWindow: expos[j].getSignedBottomWindow(c)
+    elif miniMultiExpKind == kTopWindow:    expos[j].getSignedTopWindow(top, excess)
+    else:                                   expos[j].getSignedFullWindowAt(bitIndex, c)
+
+  (curVal, curNeg) = getSignedWindow(0)
+  for j in 0 ..< N-1:
+    (nextVal, nextNeg) = getSignedWindow(j+1)
+    if nextVal.BaseType != 0:
+      # In cryptography, points are indistinguishable from random
+      # hence, without prefetching, accessing the next bucket is a guaranteed cache miss
+      prefetchLarge(buckets[nextVal.BaseType-1].addr, Write, HighTemporalLocality, maxCacheLines = 2)
+    buckets.accumulate(curVal, curNeg, elems[j])
+    curVal = nextVal
+    curNeg = nextNeg
+  buckets.accumulate(curVal, curNeg, elems[N-1])
+
+  # 2. Bucket Reduction
+  r.bucketReduce(buckets, numBuckets = 1 shl (c-1))
+
+func miniMultiExp[bits: static int, GT](
+       r: var GT,
+       buckets: ptr UncheckedArray[GT],
+       bitIndex: int, miniMultiExpKind: static MiniMultiExpKind, c: static int,
+       elems: ptr UncheckedArray[GT], expos: ptr UncheckedArray[BigInt[bits]], N: int) {.meter.} =
+  ## Apply a mini-Multi-Exponentiation on [bitIndex, bitIndex+window)
+  ## slice of all (coef, point) pairs
+
+  var windowProd{.noInit.}: typeof(r)
+  windowProd.bucketAccumReduce(
+    buckets, bitIndex, miniMultiExpKind, c,
+    elems, expos, N)
+
+  # 3. Mini-MultiExp on the slice [bitIndex, bitIndex+window)
+  r ~*= windowProd
+  when miniMultiExpKind != kBottomWindow:
+    for _ in 0 ..< c:
+      r.cyclotomic_square()
+
+func multiExpImpl_vartime[bits: static int, GT](
+       r: var GT,
+       elems: ptr UncheckedArray[GT], expos: ptr UncheckedArray[BigInt[bits]],
+       N: int, c: static int) {.tags:[VarTime, HeapAlloc], meter.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+
+  # Setup
+  # -----
+  const numBuckets = 1 shl (c-1)
+
+  let buckets = allocHeapArray(GT, numBuckets)
+  for i in 0 ..< numBuckets:
+    buckets[i].setNeutral()
+
+  # Algorithm
+  # ---------
+  const excess = bits mod c
+  const top = bits - excess
+  var w = top
+  r.setNeutral()
+
+  when top != 0:      # Prologue
+    when excess != 0:
+      r.miniMultiExp(buckets, w, kTopWindow, c, elems, expos, N)
+      w -= c
+    else:
+      # If c divides bits exactly, the signed windowed recoding still needs to see an extra 0
+      # Since we did r.setNeutral() earlier, this is a no-op
+      discard
+
+  while w != 0:       # Steady state
+    r.miniMultiExp(buckets, w, kFullWindow, c, elems, expos, N)
+    w -= c
+
+  block:              # Epilogue
+    r.miniMultiExp(buckets, w, kBottomWindow, c, elems, expos, N)
+
+  # Cleanup
+  # -------
+  buckets.freeHeap()
+
+# Endomorphism acceleration
+# -----------------------------------------------------------------------------------------------------------------------
+
+proc applyEndomorphism[bits: static int, GT](
+       elems: ptr UncheckedArray[GT],
+       expos: ptr UncheckedArray[BigInt[bits]],
+       N: int): auto =
+  ## Decompose (elems, expos) into mini-scalars
+  ## Returns a new triplet (endoElems, endoExpos, N)
+  ## endoElems and endoExpos MUST be freed afterwards
+
+  const M = when Gt is Fp6:  2
+            elif Gt is Fp12: 4
+            else: {.error: "Unconfigured".}
+
+  const L = Fr[Gt.Name].bits().computeEndoRecodedLength(M)
+  let splitExpos   = allocHeapArray(array[M, BigInt[L]], N)
+  let endoBasis    = allocHeapArray(array[M, GT], N)
+
+  for i in 0 ..< N:
+    var negateElems {.noinit.}: array[M, SecretBool]
+    splitExpos[i].decomposeEndo(negateElems, expos[i], Fr[Gt.Name].bits(), Gt.Name, G2) # 𝔾ₜ has same decomposition as 𝔾₂
+    if negateElems[0].bool:
+      endoBasis[i][0].cyclotomic_inv(elems[i])
+    else:
+      endoBasis[i][0] = elems[i]
+
+    cast[ptr array[M-1, GT]](endoBasis[i][1].addr)[].computeEndomorphisms(elems[i])
+    for m in 1 ..< M:
+      if negateElems[m].bool:
+        endoBasis[i][m].cyclotomic_inv()
+
+  let endoElems  = cast[ptr UncheckedArray[GT]](endoBasis)
+  let endoExpos = cast[ptr UncheckedArray[BigInt[L]]](splitExpos)
+
+  return (endoElems, endoExpos, M*N)
+
+template withEndo[exponentsBits: static int, GT](
+           multiExpProc: untyped,
+           r: var GT,
+           elems: ptr UncheckedArray[GT],
+           expos: ptr UncheckedArray[BigInt[exponentsBits]],
+           N: int, c: static int) =
+  when Gt.Name.hasEndomorphismAcceleration() and
+        EndomorphismThreshold <= exponentsBits and
+        exponentsBits <= Fr[Gt.Name].bits():
+    let (endoElems, endoExpos, endoN) = applyEndomorphism(elems, expos, N)
+    # Given that bits and N changed, we are able to use a bigger `c`
+    # TODO: bench
+    multiExpProc(r, endoElems, endoExpos, endoN, c)
+    freeHeap(endoElems)
+    freeHeap(endoExpos)
+  else:
+    multiExpProc(r, elems, expos, N, c)
+
+# Algorithm selection
+# -----------------------------------------------------------------------------------------------------------------------
+
+func multiexp_dispatch_vartime[bits: static int, GT](
+       r: var GT,
+       elems: ptr UncheckedArray[GT],
+       expos: ptr UncheckedArray[BigInt[bits]], N: int) =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  let c = bestBucketBitSize(N, bits, useSignedBuckets = true, useManualTuning = true)
+
+  # Given that bits and N change after applying an endomorphism,
+  # we are able to use a bigger `c`
+  # TODO: benchmark
+
+  case c
+  of  2: withEndo(multiExpImpl_vartime, r, elems, expos, N, c =  2)
+  of  3: withEndo(multiExpImpl_vartime, r, elems, expos, N, c =  3)
+  of  4: withEndo(multiExpImpl_vartime, r, elems, expos, N, c =  4)
+  of  5: withEndo(multiExpImpl_vartime, r, elems, expos, N, c =  5)
+  of  6: withEndo(multiExpImpl_vartime, r, elems, expos, N, c =  6)
+  of  7: withEndo(multiExpImpl_vartime, r, elems, expos, N, c =  7)
+  of  8: withEndo(multiExpImpl_vartime, r, elems, expos, N, c =  8)
+  of  9: withEndo(multiExpImpl_vartime, r, elems, expos, N, c =  9)
+  of 10: withEndo(multiExpImpl_vartime, r, elems, expos, N, c = 10)
+  of 11: withEndo(multiExpImpl_vartime, r, elems, expos, N, c = 11)
+  of 12: withEndo(multiExpImpl_vartime, r, elems, expos, N, c = 12)
+  of 13: withEndo(multiExpImpl_vartime, r, elems, expos, N, c = 13)
+  of 14: multiExpImpl_vartime(r, elems, expos, N, c = 14)
+  of 15: multiExpImpl_vartime(r, elems, expos, N, c = 15)
+
+  of 16..17: multiExpImpl_vartime(r, elems, expos, N, c = 16)
+  else:
+    unreachable()
+
+func multiExp_vartime*[bits: static int, GT](
+       r: var GT,
+       elems: ptr UncheckedArray[GT],
+       expos: ptr UncheckedArray[BigInt[bits]],
+       len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter, inline.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  multiExp_dispatch_vartime(r, elems, expos, len)
+
+func multiExp_vartime*[bits: static int, GT](
+       r: var GT,
+       elems: openArray[GT],
+       expos: openArray[BigInt[bits]]) {.tags:[VarTime, Alloca, HeapAlloc], meter, inline.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  debug: doAssert elems.len == expos.len
+  let N = elems.len
+  multiExp_dispatch_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N)
+
+func multiExp_vartime*[F, GT](
+       r: var GT,
+       elems: ptr UncheckedArray[GT],
+       expos: ptr UncheckedArray[F],
+       len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  let n = cast[int](len)
+  let expos_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64)
+  expos_big.batchFromField(expos, n)
+  r.multiExp_vartime(elems, expos_big, n)
+
+  freeHeapAligned(expos_big)
+
+func multiExp_vartime*[GT](
+       r: var GT,
+       elems: openArray[GT],
+       expos: openArray[Fr]) {.tags:[VarTime, Alloca, HeapAlloc], inline.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  debug: doAssert elems.len == expos.len
+  let N = elems.len
+  multiExp_vartime(r, elems.asUnchecked(), expos.asUnchecked(), N)
diff --git a/constantine/math/pairings/gt_multiexp_parallel.nim b/constantine/math/pairings/gt_multiexp_parallel.nim
new file mode 100644
index 000000000..cb81c6138
--- /dev/null
+++ b/constantine/math/pairings/gt_multiexp_parallel.nim
@@ -0,0 +1,258 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import constantine/named/algebras,
+       constantine/math/endomorphisms/split_scalars,
+       constantine/math/extension_fields,
+       constantine/math/arithmetic,
+       constantine/named/zoo_endomorphisms,
+       constantine/platforms/abstractions,
+       ./cyclotomic_subgroups,
+       constantine/threadpool
+
+import ./gt_multiexp {.all.}
+
+# No exceptions allowed in core cryptographic operations
+{.push raises: [].}
+{.push checks: off.}
+
+# ########################################################### #
+#                                                             #
+#             Multi-Exponentiation in 𝔾ₜ                      #
+#                                                             #
+# ########################################################### #
+
+proc bucketAccumReduce_withInit[bits: static int, GT](
+       windowProd: ptr GT,
+       buckets: ptr GT or ptr UncheckedArray[GT],
+       bitIndex: int,  miniMultiExpKind: static MiniMultiExpKind, c: static int,
+       elems: ptr UncheckedArray[GT], expos: ptr UncheckedArray[BigInt[bits]], N: int) =
+  const numBuckets = 1 shl (c-1)
+  let buckets = cast[ptr UncheckedArray[GT]](buckets)
+  for i in 0 ..< numBuckets:
+    buckets[i].setNeutral()
+  bucketAccumReduce(windowProd[], buckets, bitIndex, miniMultiExpKind, c, elems, expos, N)
+
+proc multiexpImpl_vartime_parallel[bits: static int, GT](
+       tp: Threadpool,
+       r: ptr GT,
+       elems: ptr UncheckedArray[GT], expos: ptr UncheckedArray[BigInt[bits]],
+       N: int, c: static int) =
+
+  # Prologue
+  # --------
+  const numBuckets = 1 shl (c-1)
+  const numFullWindows = bits div c
+  const numWindows = numFullWindows + 1 # Even if `bits div c` is exact, the signed recoding needs to see an extra 0 after the MSB
+
+  # Instead of storing the result in futures, risking them being scattered in memory
+  # we store them in a contiguous array, and the synchronizing future just returns a bool.
+  # top window is done on this thread
+  let miniMultiExpsResults = allocHeapArray(GT, numFullWindows)
+  let miniMultiExpsReady   = allocStackArray(FlowVar[bool], numFullWindows)
+
+  let bucketsMatrix = allocHeapArray(GT, numBuckets*numWindows)
+
+  # Algorithm
+  # ---------
+
+  block: # 1. Bucket accumulation and reduction
+    miniMultiExpsReady[0] = tp.spawnAwaitable bucketAccumReduce_withInit(
+                                  miniMultiExpsResults[0].addr,
+                                  bucketsMatrix[0].addr,
+                                  bitIndex = 0, kBottomWindow, c,
+                                  elems, expos, N)
+
+  for w in 1 ..< numFullWindows:
+    miniMultiExpsReady[w] = tp.spawnAwaitable bucketAccumReduce_withInit(
+                                  miniMultiExpsResults[w].addr,
+                                  bucketsMatrix[w*numBuckets].addr,
+                                  bitIndex = w*c, kFullWindow, c,
+                                  elems, expos, N)
+
+  # Last window is done sync on this thread, directly initializing r
+  const excess = bits mod c
+  const top = bits-excess
+
+  when top != 0:
+    when excess != 0:
+      bucketAccumReduce_withInit(
+        r,
+        bucketsMatrix[numFullWindows*numBuckets].addr,
+        bitIndex = top, kTopWindow, c,
+        elems, expos, N)
+    else:
+      r[].setNeutral()
+
+  # 3. Final reduction, r initialized to what would be miniMSMsReady[numWindows-1]
+  when excess != 0:
+    for w in countdown(numWindows-2, 0):
+      for _ in 0 ..< c:
+        r[].cyclotomic_square()
+      discard sync miniMultiExpsReady[w]
+      r[] ~*= miniMultiExpsResults[w]
+  elif numWindows >= 2:
+    discard sync miniMultiExpsReady[numWindows-2]
+    r[] = miniMultiExpsResults[numWindows-2]
+    for w in countdown(numWindows-3, 0):
+      for _ in 0 ..< c:
+        r[].cyclotomic_square()
+      discard sync miniMultiExpsReady[w]
+      r[] ~*= miniMultiExpsResults[w]
+
+  # Cleanup
+  # -------
+  miniMultiExpsResults.freeHeap()
+  bucketsMatrix.freeHeap()
+
+# Endomorphism acceleration
+# -----------------------------------------------------------------------------------------------------------------------
+
+proc applyEndomorphism_parallel[bits: static int, GT](
+       tp: Threadpool,
+       elems: ptr UncheckedArray[GT],
+       expos: ptr UncheckedArray[BigInt[bits]],
+       N: int): auto =
+  ## Decompose (elems, expos) into mini-scalars
+  ## Returns a new triplet (endoElems, endoExpos, N)
+  ## endoElems and endoExpos MUST be freed afterwards
+
+  const M = when Gt is Fp6:  2
+            elif Gt is Fp12: 4
+            else: {.error: "Unconfigured".}
+
+  const L = Fr[Gt.Name].bits().computeEndoRecodedLength(M)
+  let splitExpos   = allocHeapArray(array[M, BigInt[L]], N)
+  let endoBasis    = allocHeapArray(array[M, GT], N)
+
+  syncScope:
+    tp.parallelFor i in 0 ..< N:
+      captures: {elems, expos, splitExpos, endoBasis}
+
+      var negateElems {.noinit.}: array[M, SecretBool]
+      splitExpos[i].decomposeEndo(negateElems, expos[i], Fr[Gt.Name].bits(), Gt.Name, G2) # 𝔾ₜ has same decomposition as 𝔾₂
+      if negateElems[0].bool:
+        endoBasis[i][0].cyclotomic_inv(elems[i])
+      else:
+        endoBasis[i][0] = elems[i]
+
+      cast[ptr array[M-1, GT]](endoBasis[i][1].addr)[].computeEndomorphisms(elems[i])
+      for m in 1 ..< M:
+        if negateElems[m].bool:
+          endoBasis[i][m].cyclotomic_inv()
+
+  let endoElems  = cast[ptr UncheckedArray[GT]](endoBasis)
+  let endoExpos = cast[ptr UncheckedArray[BigInt[L]]](splitExpos)
+
+  return (endoElems, endoExpos, M*N)
+
+template withEndo[exponentsBits: static int, GT](
+           multiExpProc: untyped,
+           tp: Threadpool,
+           r: ptr GT,
+           elems: ptr UncheckedArray[GT],
+           expos: ptr UncheckedArray[BigInt[exponentsBits]],
+           N: int, c: static int) =
+  when Gt.Name.hasEndomorphismAcceleration() and
+        EndomorphismThreshold <= exponentsBits and
+        exponentsBits <= Fr[Gt.Name].bits():
+    let (endoElems, endoExpos, endoN) = applyEndomorphism_parallel(tp, elems, expos, N)
+    # Given that bits and N changed, we are able to use a bigger `c`
+    # TODO: bench
+    multiExpProc(tp, r, endoElems, endoExpos, endoN, c)
+    freeHeap(endoElems)
+    freeHeap(endoExpos)
+  else:
+    multiExpProc(tp, r, elems, expos, N, c)
+
+# Algorithm selection
+# -----------------------------------------------------------------------------------------------------------------------
+
+proc multiexp_dispatch_vartime_parallel[bits: static int, GT](
+       tp: Threadpool,
+       r: ptr GT,
+       elems: ptr UncheckedArray[GT],
+       expos: ptr UncheckedArray[BigInt[bits]], N: int) =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  let c = bestBucketBitSize(N, bits, useSignedBuckets = true, useManualTuning = true)
+
+  # Given that bits and N change after applying an endomorphism,
+  # we are able to use a bigger `c`
+  # TODO: benchmark
+
+  case c
+  of  2: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c =  2)
+  of  3: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c =  3)
+  of  4: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c =  4)
+  of  5: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c =  5)
+  of  6: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c =  6)
+  of  7: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c =  7)
+  of  8: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c =  8)
+  of  9: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c =  9)
+  of 10: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 10)
+  of 11: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 11)
+  of 12: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 12)
+  of 13: withEndo(multiExpImpl_vartime_parallel, tp, r, elems, expos, N, c = 13)
+  of 14: multiExpImpl_vartime_parallel(tp, r, elems, expos, N, c = 14)
+  of 15: multiExpImpl_vartime_parallel(tp, r, elems, expos, N, c = 15)
+
+  of 16..17: multiExpImpl_vartime_parallel(tp, r, elems, expos, N, c = 16)
+  else:
+    unreachable()
+
+proc multiExp_vartime_parallel*[bits: static int, GT](
+       tp: Threadpool,
+       r: ptr GT,
+       elems: ptr UncheckedArray[GT],
+       expos: ptr UncheckedArray[BigInt[bits]],
+       len: int) {.meter, inline.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  tp.multiExp_dispatch_vartime_parallel(r, elems, expos, len)
+
+proc multiExp_vartime_parallel*[bits: static int, GT](
+       tp: Threadpool,
+       r: var GT,
+       elems: openArray[GT],
+       expos: openArray[BigInt[bits]]) {.meter, inline.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  debug: doAssert elems.len == expos.len
+  let N = elems.len
+  tp.multiExp_dispatch_vartime_parallel(r.addr, elems.asUnchecked(), expos.asUnchecked(), N)
+
+proc multiExp_vartime_parallel*[F, GT](
+       tp: Threadpool,
+       r: ptr GT,
+       elems: ptr UncheckedArray[GT],
+       expos: ptr UncheckedArray[F],
+       len: int) {.meter.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  let n = cast[int](len)
+  let expos_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64)
+
+  syncScope:
+    tp.parallelFor i in 0 ..< n:
+      captures: {expos, expos_big}
+      expos_big[i].fromField(expos[i])
+  tp.multiExp_vartime_parallel(r, elems, expos_big, n)
+
+  freeHeapAligned(expos_big)
+
+proc multiExp_vartime_parallel*[GT](
+       tp: Threadpool,
+       r: var GT,
+       elems: openArray[GT],
+       expos: openArray[Fr]) {.meter, inline.} =
+  ## Multiexponentiation:
+  ##   r <- g₀^a₀ + g₁^a₁ + ... + gₙ^aₙ
+  debug: doAssert elems.len == expos.len
+  let N = elems.len
+  tp.multiExp_vartime_parallel(r.addr, elems.asUnchecked(), expos.asUnchecked(), N)
diff --git a/constantine/math/pairings/pairings_generic.nim b/constantine/math/pairings/pairings_generic.nim
index a8a79f696..df95b44e6 100644
--- a/constantine/math/pairings/pairings_generic.nim
+++ b/constantine/math/pairings/pairings_generic.nim
@@ -27,11 +27,16 @@ func millerLoop*[Name: static Algebra](gt: var Fp12[Name], Q, P: auto, n: int) {
   else:
     gt.millerLoopAddchain(Q, P, n)
 
-func finalExp*[Name: static Algebra](gt: var Fp12[Name]){.inline.} =
-  gt.finalExpEasy()
+export finalExpEasy
+
+func finalExpHard*[Name: static Algebra](gt: var Fp12[Name]) {.inline.} =
   when family(Name) == BarretoNaehrig:
     gt.finalExpHard_BN()
   elif family(Name) == BarretoLynnScott:
     gt.finalExpHard_BLS12()
   else:
     {.error: "Final Exponentiation not implemented for " & $Name.}
+
+func finalExp*[Name: static Algebra](gt: var Fp12[Name]){.inline.} =
+  gt.finalExpEasy()
+  gt.finalExpHard()
diff --git a/tests/math_pairings/t_pairing_bls12_377_optate.nim b/tests/math_pairings/t_pairing_bls12_377_optate.nim
index d6cbe09ee..87f2cbd75 100644
--- a/tests/math_pairings/t_pairing_bls12_377_optate.nim
+++ b/tests/math_pairings/t_pairing_bls12_377_optate.nim
@@ -6,14 +6,11 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import
-  constantine/math/pairings/pairings_bls12,
-  # Test utilities
-  ./t_pairing_template
+import ./t_pairing_template
 
 runPairingTests(
-  4, BLS12_377,
+  BLS12_377,
   G1 = EC_ShortW_Prj[Fp[BLS12_377], G1],
   G2 = EC_ShortW_Prj[Fp2[BLS12_377], G2],
   GT = Fp12[BLS12_377],
-  pairing_bls12)
+  iters = 4)
diff --git a/tests/math_pairings/t_pairing_bls12_381_gt_exp.nim b/tests/math_pairings/t_pairing_bls12_381_gt_exp.nim
index 2309c628c..9f65921d1 100644
--- a/tests/math_pairings/t_pairing_bls12_381_gt_exp.nim
+++ b/tests/math_pairings/t_pairing_bls12_381_gt_exp.nim
@@ -10,6 +10,4 @@ import
   # Test utilities
   ./t_pairing_template
 
-runGTexponentiationTests(
-  Iters = 4,
-  GT = Fp12[BLS12_381])
+runGTexponentiationTests(GT = Fp12[BLS12_381], iters =4)
diff --git a/tests/math_pairings/t_pairing_bls12_381_gt_multiexp.nim b/tests/math_pairings/t_pairing_bls12_381_gt_multiexp.nim
new file mode 100644
index 000000000..3a95462c9
--- /dev/null
+++ b/tests/math_pairings/t_pairing_bls12_381_gt_multiexp.nim
@@ -0,0 +1,18 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Test utilities
+  ./t_pairing_template
+
+const numPoints = [1, 2, 8, 16, 128, 256, 1024]
+
+runGTmultiexpTests(
+  GT = Fp12[BLS12_381],
+  numPoints,
+  iters = 4)
diff --git a/tests/math_pairings/t_pairing_bls12_381_gt_subgroup.nim b/tests/math_pairings/t_pairing_bls12_381_gt_subgroup.nim
index 371ef6226..31ea0f87d 100644
--- a/tests/math_pairings/t_pairing_bls12_381_gt_subgroup.nim
+++ b/tests/math_pairings/t_pairing_bls12_381_gt_subgroup.nim
@@ -6,12 +6,6 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import
-  constantine/math/pairings/pairings_bls12,
-  # Test utilities
-  ./t_pairing_template
+import ./t_pairing_template
 
-runGTsubgroupTests(
-  Iters = 4,
-  GT = Fp12[BLS12_381],
-  finalExpHard_BLS12)
+runGTsubgroupTests(GT = Fp12[BLS12_381], iters = 4)
diff --git a/tests/math_pairings/t_pairing_bls12_381_optate.nim b/tests/math_pairings/t_pairing_bls12_381_optate.nim
index eda18b2e9..b75fa67bc 100644
--- a/tests/math_pairings/t_pairing_bls12_381_optate.nim
+++ b/tests/math_pairings/t_pairing_bls12_381_optate.nim
@@ -6,14 +6,11 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import
-  constantine/math/pairings/pairings_bls12,
-  # Test utilities
-  ./t_pairing_template
+import ./t_pairing_template
 
 runPairingTests(
-  4, BLS12_381,
+  BLS12_381,
   G1 = EC_ShortW_Prj[Fp[BLS12_381], G1],
   G2 = EC_ShortW_Prj[Fp2[BLS12_381], G2],
   GT = Fp12[BLS12_381],
-  pairing_bls12)
+  iters = 4)
diff --git a/tests/math_pairings/t_pairing_bn254_nogami_gt_subgroup.nim b/tests/math_pairings/t_pairing_bn254_nogami_gt_subgroup.nim
index 0838afd0f..71b9a8ad2 100644
--- a/tests/math_pairings/t_pairing_bn254_nogami_gt_subgroup.nim
+++ b/tests/math_pairings/t_pairing_bn254_nogami_gt_subgroup.nim
@@ -6,12 +6,6 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import
-  constantine/math/pairings/pairings_bn,
-  # Test utilities
-  ./t_pairing_template
+import ./t_pairing_template
 
-runGTsubgroupTests(
-  Iters = 4,
-  GT = Fp12[BN254_Nogami],
-  finalExpHard_BN)
+runGTsubgroupTests(GT = Fp12[BN254_Nogami], iters = 4)
diff --git a/tests/math_pairings/t_pairing_bn254_nogami_optate.nim b/tests/math_pairings/t_pairing_bn254_nogami_optate.nim
index 6c72eea58..d5b3fc755 100644
--- a/tests/math_pairings/t_pairing_bn254_nogami_optate.nim
+++ b/tests/math_pairings/t_pairing_bn254_nogami_optate.nim
@@ -6,14 +6,11 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import
-  constantine/math/pairings/pairings_bn,
-  # Test utilities
-  ./t_pairing_template
+import ./t_pairing_template
 
 runPairingTests(
-  4, BN254_Nogami,
+  BN254_Nogami,
   G1 = EC_ShortW_Prj[Fp[BN254_Nogami], G1],
   G2 = EC_ShortW_Prj[Fp2[BN254_Nogami], G2],
   GT = Fp12[BN254_Nogami],
-  pairing_bn)
+  iters = 4)
diff --git a/tests/math_pairings/t_pairing_bn254_snarks_gt_exp.nim b/tests/math_pairings/t_pairing_bn254_snarks_gt_exp.nim
index dea4efd8b..38d8cc3f4 100644
--- a/tests/math_pairings/t_pairing_bn254_snarks_gt_exp.nim
+++ b/tests/math_pairings/t_pairing_bn254_snarks_gt_exp.nim
@@ -10,6 +10,4 @@ import
   # Test utilities
   ./t_pairing_template
 
-runGTexponentiationTests(
-  Iters = 4,
-  GT = Fp12[BN254_Nogami])
+runGTexponentiationTests(GT = Fp12[BN254_Nogami], iters = 4)
diff --git a/tests/math_pairings/t_pairing_bn254_snarks_gt_multiexp.nim b/tests/math_pairings/t_pairing_bn254_snarks_gt_multiexp.nim
new file mode 100644
index 000000000..d9e746a95
--- /dev/null
+++ b/tests/math_pairings/t_pairing_bn254_snarks_gt_multiexp.nim
@@ -0,0 +1,18 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Test utilities
+  ./t_pairing_template
+
+const numPoints = [1, 2, 8, 16, 128, 256, 1024]
+
+runGTmultiexpTests(
+  GT = Fp12[BN254_Snarks],
+  numPoints,
+  iters = 4)
diff --git a/tests/math_pairings/t_pairing_bn254_snarks_gt_subgroup.nim b/tests/math_pairings/t_pairing_bn254_snarks_gt_subgroup.nim
index 7a3002de8..d78797797 100644
--- a/tests/math_pairings/t_pairing_bn254_snarks_gt_subgroup.nim
+++ b/tests/math_pairings/t_pairing_bn254_snarks_gt_subgroup.nim
@@ -6,12 +6,6 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import
-  constantine/math/pairings/pairings_bn,
-  # Test utilities
-  ./t_pairing_template
+import ./t_pairing_template
 
-runGTsubgroupTests(
-  Iters = 4,
-  GT = Fp12[BN254_Snarks],
-  finalExpHard_BN)
+runGTsubgroupTests(GT = Fp12[BN254_Snarks], iters = 4)
diff --git a/tests/math_pairings/t_pairing_bn254_snarks_optate.nim b/tests/math_pairings/t_pairing_bn254_snarks_optate.nim
index 59635f766..6cbd21ea0 100644
--- a/tests/math_pairings/t_pairing_bn254_snarks_optate.nim
+++ b/tests/math_pairings/t_pairing_bn254_snarks_optate.nim
@@ -6,14 +6,11 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import
-  constantine/math/pairings/pairings_bn,
-  # Test utilities
-  ./t_pairing_template
+import ./t_pairing_template
 
 runPairingTests(
-  4, BN254_Snarks,
+  BN254_Snarks,
   G1 = EC_ShortW_Prj[Fp[BN254_Snarks], G1],
   G2 = EC_ShortW_Prj[Fp2[BN254_Snarks], G2],
   GT = Fp12[BN254_Snarks],
-  pairing_bn)
+  iters = 4)
diff --git a/tests/math_pairings/t_pairing_bw6_761_gt_subgroup.nim b/tests/math_pairings/t_pairing_bw6_761_gt_subgroup.nim
index 7be0753b8..4a5e72c10 100644
--- a/tests/math_pairings/t_pairing_bw6_761_gt_subgroup.nim
+++ b/tests/math_pairings/t_pairing_bw6_761_gt_subgroup.nim
@@ -6,12 +6,6 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import
-  constantine/math/pairings/pairings_bw6_761,
-  # Test utilities
-  ./t_pairing_template
+import ./t_pairing_template
 
-runGTsubgroupTests(
-  Iters = 4,
-  GT = Fp6[BW6_761],
-  finalExpHard_BW6_761)
+runGTsubgroupTests(GT = Fp6[BW6_761], iters = 4)
diff --git a/tests/math_pairings/t_pairing_bw6_761_optate.nim b/tests/math_pairings/t_pairing_bw6_761_optate.nim
index 34870bed3..7c92c115a 100644
--- a/tests/math_pairings/t_pairing_bw6_761_optate.nim
+++ b/tests/math_pairings/t_pairing_bw6_761_optate.nim
@@ -6,14 +6,11 @@
 #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
 # at your option. This file may not be copied, modified, or distributed except according to those terms.
 
-import
-  constantine/math/pairings/pairings_bw6_761,
-  # Test utilities
-  ./t_pairing_template
+import ./t_pairing_template
 
 runPairingTests(
-  4, BW6_761,
+  BW6_761,
   G1 = EC_ShortW_Prj[Fp[BW6_761], G1],
   G2 = EC_ShortW_Prj[Fp[BW6_761], G2],
   GT = Fp6[BW6_761],
-  pairing_bw6_761_reference)
+  iters = 4)
diff --git a/tests/math_pairings/t_pairing_template.nim b/tests/math_pairings/t_pairing_template.nim
index 0c7382478..a19b777b4 100644
--- a/tests/math_pairings/t_pairing_template.nim
+++ b/tests/math_pairings/t_pairing_template.nim
@@ -14,12 +14,13 @@ import
   constantine/math/arithmetic,
   constantine/math/extension_fields,
   constantine/named/algebras,
-  constantine/named/zoo_subgroups,
+  constantine/named/[zoo_subgroups, zoo_pairings],
   constantine/math/elliptic/[ec_shortweierstrass_affine, ec_shortweierstrass_projective],
   constantine/math/pairings/[
     cyclotomic_subgroups,
     gt_exponentiations,
     gt_exponentiations_vartime,
+    gt_multiexp,
     pairings_generic],
   constantine/math/io/io_extfields,
 
@@ -27,13 +28,10 @@ import
   helpers/prng_unsafe
 
 export
-  prng_unsafe, times, unittest,
+  unittest, # Generic sandwich
   ec_shortweierstrass_affine, ec_shortweierstrass_projective,
-  arithmetic, extension_fields,
-  io_extfields,
-  cyclotomic_subgroups,
-  gt_exponentiations, gt_exponentiations_vartime,
-  abstractions, algebras
+  extension_fields,
+  algebras
 
 type
   RandomGen* = enum
@@ -64,7 +62,7 @@ func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen)
     result = rng.random_long01Seq(EC)
     result.clearCofactor()
 
-template runPairingTests*(Iters: static int, Name: static Algebra, G1, G2, GT: typedesc, pairing_fn: untyped): untyped {.dirty.}=
+proc runPairingTests*(Name: static Algebra, G1, G2, GT: typedesc, iters: int) =
   bind affineType
 
   var rng: RngState
@@ -73,8 +71,8 @@ template runPairingTests*(Iters: static int, Name: static Algebra, G1, G2, GT: t
   echo "\n------------------------------------------------------\n"
   echo "test_pairing_",$Name,"_optate xoshiro512** seed: ", timeseed
 
-  proc test_bilinearity_double_impl(randZ: bool, gen: RandomGen) =
-    for _ in 0 ..< Iters:
+  proc test_bilinearity_double_impl(randZ: bool, gen: RandomGen, iters: int) =
+    for _ in 0 ..< iters:
       let P = rng.random_point(G1, randZ, gen)
       let Q = rng.random_point(G2, randZ, gen)
       var P2: typeof(P)
@@ -93,10 +91,10 @@ template runPairingTests*(Iters: static int, Name: static Algebra, G1, G2, GT: t
       Qa.affine(Q)
       Qa2.affine(Q2)
 
-      r.pairing_fn(Pa, Qa)
+      r.pairing(Pa, Qa)
       r.square()
-      r2.pairing_fn(Pa2, Qa)
-      r3.pairing_fn(Pa, Qa2)
+      r2.pairing(Pa2, Qa)
+      r3.pairing(Pa, Qa2)
 
       doAssert bool(not r.isZero())
       doAssert bool(not r.isOne())
@@ -106,11 +104,11 @@ template runPairingTests*(Iters: static int, Name: static Algebra, G1, G2, GT: t
 
   suite "Pairing - Optimal Ate on " & $Name & " [" & $WordBitWidth & "-bit words]":
     test "Bilinearity e([2]P, Q) = e(P, [2]Q) = e(P, Q)^2":
-      test_bilinearity_double_impl(randZ = false, gen = Uniform)
-      test_bilinearity_double_impl(randZ = false, gen = HighHammingWeight)
-      test_bilinearity_double_impl(randZ = false, gen = Long01Sequence)
+      test_bilinearity_double_impl(randZ = false, gen = Uniform, iters)
+      test_bilinearity_double_impl(randZ = false, gen = HighHammingWeight, iters)
+      test_bilinearity_double_impl(randZ = false, gen = Long01Sequence, iters)
 
-func random_elem(rng: var RngState, F: typedesc, gen: RandomGen): F {.inline, noInit.} =
+func random_elem*(rng: var RngState, F: typedesc, gen: RandomGen): F {.noInit.} =
   if gen == Uniform:
     result = rng.random_unsafe(F)
   elif gen == HighHammingWeight:
@@ -118,7 +116,7 @@ func random_elem(rng: var RngState, F: typedesc, gen: RandomGen): F {.inline, no
   else:
     result = rng.random_long01Seq(F)
 
-template runGTsubgroupTests*(Iters: static int, GT: typedesc, finalExpHard_fn: untyped): untyped {.dirty.}=
+proc runGTsubgroupTests*(GT: typedesc, iters: int) =
   bind affineType, random_elem
 
   var rng: RngState
@@ -127,9 +125,9 @@ template runGTsubgroupTests*(Iters: static int, GT: typedesc, finalExpHard_fn: u
   echo "\n------------------------------------------------------\n"
   echo "test_pairing_",$GT.Name,"_gt xoshiro512** seed: ", timeseed
 
-  proc test_gt_impl(gen: RandomGen) =
+  proc test_gt_impl(gen: RandomGen, iters: int) =
     stdout.write "    "
-    for _ in 0 ..< Iters:
+    for _ in 0 ..< iters:
       let a = rng.random_elem(GT, gen)
       doAssert not bool a.isInCyclotomicSubgroup(), "The odds of generating randomly such an element are too low a: " & a.toHex()
       var a2 = a
@@ -137,7 +135,7 @@ template runGTsubgroupTests*(Iters: static int, GT: typedesc, finalExpHard_fn: u
       doAssert bool a2.isInCyclotomicSubgroup()
       doAssert not bool a2.isInPairingSubgroup(), "The odds of generating randomly such an element are too low a2: " & a.toHex()
       var a3 = a2
-      finalExpHard_fn(a3)
+      finalExpHard(a3)
       doAssert bool a3.isInCyclotomicSubgroup()
       doAssert bool a3.isInPairingSubgroup()
       stdout.write '.'
@@ -146,30 +144,24 @@ template runGTsubgroupTests*(Iters: static int, GT: typedesc, finalExpHard_fn: u
 
   suite "Pairing - 𝔾ₜ subgroup " & $GT.Name & " [" & $WordBitWidth & "-bit words]":
     test "Final Exponentiation and 𝔾ₜ-subgroup membership":
-      test_gt_impl(gen = Uniform)
-      test_gt_impl(gen = HighHammingWeight)
-      test_gt_impl(gen = Long01Sequence)
-
-func random_gt*(rng: var RngState, F: typedesc, gen: RandomGen): F {.inline, noInit.} =
-  if gen == Uniform:
-    result = rng.random_unsafe(F)
-  elif gen == HighHammingWeight:
-    result = rng.random_highHammingWeight(F)
-  else:
-    result = rng.random_long01Seq(F)
+      test_gt_impl(gen = Uniform, iters)
+      test_gt_impl(gen = HighHammingWeight, iters)
+      test_gt_impl(gen = Long01Sequence, iters)
 
+func random_gt(rng: var RngState, F: typedesc, gen: RandomGen): F {.noInit.} =
+  result = rng.random_elem(F, gen)
   result.finalExp()
 
-template runGTexponentiationTests*(Iters: static int, GT: typedesc): untyped {.dirty.} =
+proc runGTexponentiationTests*(GT: typedesc, iters: int) =
   var rng: RngState
   let timeseed = uint32(toUnix(getTime()) and (1'i64 shl 32 - 1)) # unixTime mod 2^32
   seed(rng, timeseed)
   echo "\n------------------------------------------------------\n"
   echo "test_pairing_",$GT.Name,"_gt_exponentiation xoshiro512** seed: ", timeseed
 
-  proc test_gt_exponentiation_impl(gen: RandomGen) =
+  proc test_gt_exponentiation_impl(gen: RandomGen, iters: int) =
     stdout.write "    "
-    for _ in 0 ..< Iters:
+    for _ in 0 ..< iters:
       let a = rng.random_gt(GT, gen)
       let kUnred = rng.random_long01seq(GT.Name.getBigInt(kScalarField))
       var k {.noInit.}: GT.Name.getBigInt(kScalarField)
@@ -219,6 +211,46 @@ template runGTexponentiationTests*(Iters: static int, GT: typedesc): untyped {.d
 
   suite "Pairing - Exponentiation for 𝔾ₜ " & $GT.Name & " [" & $WordBitWidth & "-bit words]":
     test "𝔾ₜ exponentiation consistency":
-      test_gt_exponentiation_impl(gen = Uniform)
-      test_gt_exponentiation_impl(gen = HighHammingWeight)
-      test_gt_exponentiation_impl(gen = Long01Sequence)
+      test_gt_exponentiation_impl(gen = Uniform, iters)
+      test_gt_exponentiation_impl(gen = HighHammingWeight, iters)
+      test_gt_exponentiation_impl(gen = Long01Sequence, iters)
+
+proc runGTmultiexpTests*[N: static int](GT: typedesc, num_points: array[N, int], iters: int) =
+  var rng: RngState
+  let timeseed = uint32(toUnix(getTime()) and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+  seed(rng, timeseed)
+  echo "\n------------------------------------------------------\n"
+  echo "test_pairing_",$GT.Name,"_gt_multiexp xoshiro512** seed: ", timeseed
+
+  proc test_gt_multiexp_impl[N](GT: typedesc, rng: var RngState, num_points: array[N, int], gen: RandomGen, iters: int) =
+    for N in num_points:
+      stdout.write "    "
+      for _ in 0 ..< iters:
+        var elems = newSeq[GT](N)
+        var exponents = newSeq[Fr[GT.Name]](N)
+
+        for i in 0 ..< N:
+          elems[i] = rng.random_gt(GT, gen)
+          exponents[i] = rng.random_elem(Fr[GT.Name], gen)
+
+        var naive: GT
+        naive.setOne()
+        for i in 0 ..< N:
+          var t {.noInit.}: GT
+          t.gtExp_vartime(elems[i], exponents[i])
+          naive *= t
+
+        var mexp_ref, mexp_opt: GT
+        mexp_ref.multiExp_reference_vartime(elems, exponents)
+        mexp_opt.multiExp_vartime(elems, exponents)
+
+        doAssert bool(naive == mexp_ref)
+        doAssert bool(naive == mexp_opt)
+
+        stdout.write '.'
+
+      stdout.write '\n'
+
+  suite "Pairing - MultiExponentiation for 𝔾ₜ " & $GT.Name & " [" & $WordBitWidth & "-bit words]":
+    test "𝔾ₜ multi-exponentiation consistency":
+      test_gt_multiexp_impl(GT, rng, num_points, gen = Long01Sequence, iters)
diff --git a/tests/parallel/t_ec_template_parallel.nim b/tests/parallel/t_ec_template_parallel.nim
index 06102c546..b6f1902f8 100644
--- a/tests/parallel/t_ec_template_parallel.nim
+++ b/tests/parallel/t_ec_template_parallel.nim
@@ -33,7 +33,7 @@ import
   # Test utilities
   helpers/prng_unsafe
 
-export unittest, abstractions, arithmetic # Generic sandwich
+export unittest, abstractions, arithmetic, ec_twistededwards_affine # Generic sandwich
 
 type
   RandomGen* = enum
diff --git a/tests/parallel/t_pairing_bls12_381_gt_multiexp_parallel.nim b/tests/parallel/t_pairing_bls12_381_gt_multiexp_parallel.nim
new file mode 100644
index 000000000..98fd820c4
--- /dev/null
+++ b/tests/parallel/t_pairing_bls12_381_gt_multiexp_parallel.nim
@@ -0,0 +1,18 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Test utilities
+  ./t_pairing_template_parallel
+
+const numPoints = [1, 2, 8, 16, 128, 256, 1024]
+
+runGTmultiexp_parallel_Tests(
+  GT = Fp12[BLS12_381],
+  numPoints,
+  Iters = 4)
diff --git a/tests/parallel/t_pairing_template_parallel.nim b/tests/parallel/t_pairing_template_parallel.nim
new file mode 100644
index 000000000..cfa6db6a4
--- /dev/null
+++ b/tests/parallel/t_pairing_template_parallel.nim
@@ -0,0 +1,88 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Standard library
+  std/unittest, times,
+  # Internals
+  constantine/platforms/abstractions,
+  constantine/math/extension_fields,
+  constantine/named/algebras,
+  constantine/math/pairings/[
+    gt_exponentiations_vartime,
+    gt_multiexp_parallel,
+    pairings_generic],
+  constantine/threadpool,
+
+  # Test utilities
+  helpers/prng_unsafe
+
+export
+  unittest,     # generic sandwich
+  abstractions, # generic sandwich
+  extension_fields,
+  algebras
+
+type
+  RandomGen* = enum
+    Uniform
+    HighHammingWeight
+    Long01Sequence
+
+func random_elem(rng: var RngState, F: typedesc, gen: RandomGen): F {.noInit.} =
+  if gen == Uniform:
+    result = rng.random_unsafe(F)
+  elif gen == HighHammingWeight:
+    result = rng.random_highHammingWeight(F)
+  else:
+    result = rng.random_long01Seq(F)
+
+func random_gt(rng: var RngState, F: typedesc, gen: RandomGen): F {.noInit.} =
+  result = rng.random_elem(F, gen)
+  result.finalExp()
+
+proc runGTmultiexp_parallel_Tests*[N: static int](GT: typedesc, num_points: array[N, int], Iters: int) =
+  var rng: RngState
+  let timeseed = uint32(toUnix(getTime()) and (1'i64 shl 32 - 1)) # unixTime mod 2^32
+  seed(rng, timeseed)
+  echo "\n------------------------------------------------------\n"
+  echo "test_pairing_",$GT.Name,"_gt_multiexp_parallel xoshiro512** seed: ", timeseed
+
+  proc test_gt_multiexp_parallel_impl[N](GT: typedesc, rng: var RngState, num_points: array[N, int], gen: RandomGen, iters: int) =
+    let tp = Threadpool.new()
+    defer: tp.shutdown()
+
+    for N in num_points:
+      stdout.write "    "
+      for _ in 0 ..< iters:
+        var elems = newSeq[GT](N)
+        var exponents = newSeq[Fr[GT.Name]](N)
+
+        for i in 0 ..< N:
+          elems[i] = rng.random_gt(GT, gen)
+          exponents[i] = rng.random_elem(Fr[GT.Name], gen)
+
+        var naive: GT
+        naive.setOne()
+        for i in 0 ..< N:
+          var t {.noInit.}: GT
+          t.gtExp_vartime(elems[i], exponents[i])
+          naive *= t
+
+        var mexp: GT
+        tp.multiExp_vartime_parallel(mexp, elems, exponents)
+
+        doAssert bool(naive == mexp)
+
+        stdout.write '.'
+
+      stdout.write '\n'
+
+  suite "Pairing - Parallel MultiExponentiation for 𝔾ₜ " & $GT.Name & " [" & $WordBitWidth & "-bit words]":
+    test "Parallel 𝔾ₜ multi-exponentiation consistency":
+      test_gt_multiexp_parallel_impl(GT, rng, num_points, gen = Long01Sequence, Iters)