Skip to content

Commit

Permalink
π”Ύβ‚œ multi-exponentiations (#436)
Browse files Browse the repository at this point in the history
* feat(gt-multiexp): add baseline multi-exponentiation on 𝔾ₜ based on BDLO12

* feat(gt-multiexp): add optimized multi-exponentiation in 𝔾ₜ

* feat(gt-multiexp): add parallel multi-exponentiation in 𝔾ₜ

* feat(gt-multiexp): enable parallel tests in nimble

* magic workaround for nim-lang/Nim#23853
  • Loading branch information
mratsim authored Jul 19, 2024
1 parent 4dea093 commit 9268502
Show file tree
Hide file tree
Showing 32 changed files with 1,323 additions and 191 deletions.
4 changes: 2 additions & 2 deletions benchmarks/bench_elliptic_parallel_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,15 @@ proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters:
var startNaive, stopNaive, startMSMbaseline, stopMSMbaseline, startMSMopt, stopMSMopt, startMSMpara, stopMSMpara: MonoTime

if numInputs <= 100000:
startNaive = getMonotime()
# startNaive = getMonotime()
bench("EC scalar muls " & align($numInputs, 10) & " (" & $bits & "-bit coefs, points)", EC, iters):
var tmp: EC
r.setNeutral()
for i in 0 ..< points.len:
tmp.fromAffine(points[i])
tmp.scalarMul(coefs[i])
r += tmp
stopNaive = getMonotime()
# stopNaive = getMonotime()

if numInputs <= 100000:
startNaive = getMonotime()
Expand Down
1 change: 0 additions & 1 deletion benchmarks/bench_gt.nim
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ proc main() =
separator()
staticFor i, 0, AvailableCurves.len:
const curve = AvailableCurves[i]
const bits = Fr[curve].bits()
separator()
mulBench(Fp12[curve], Iters)
sqrBench(Fp12[curve], Iters)
Expand Down
46 changes: 46 additions & 0 deletions benchmarks/bench_gt_multiexp_bls12_381.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
# Internals
constantine/named/algebras,
constantine/math/extension_fields,
# Helpers
./bench_gt_parallel_template

# ############################################################
#
# Benchmark of the 𝔾ₜ group of
# Pairing Friendly curves
#
# ############################################################

const
  # Iteration budget; `main` divides it by the number of inputs of each run.
  Iters = 10000

  # Curves to benchmark. Commented-out entries are currently disabled.
  AvailableCurves = [
    # BN254_Nogami,
    # BN254_Snarks,
    # BLS12_377,
    BLS12_381,
  ]

  # Input sizes, i.e. number of (element, exponent) pairs per benchmark run.
  testNumPoints = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]

proc main() =
  ## Benchmarks multi-exponentiation for every curve in `AvailableCurves`
  ## and every input size in `testNumPoints`.
  separator()
  staticFor i, 0, AvailableCurves.len:
    const curve = AvailableCurves[i]
    # One context per curve, sized for the largest entry of `testNumPoints`.
    var ctx = createBenchMultiExpContext(Fp12[curve], testNumPoints)
    separator()
    for numPoints in testNumPoints:
      # Fewer iterations for larger inputs, but always at least one.
      ctx.multiExpParallelBench(numPoints, max(1, Iters div numPoints))
      separator()
    separator()

# Script entry point: run every benchmark, then call `notes`
# (re-exported by the benchmark template).
main()
notes()
1 change: 1 addition & 0 deletions benchmarks/bench_gt_multiexp_bls12_381.nim.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
--threads:on
189 changes: 189 additions & 0 deletions benchmarks/bench_gt_parallel_template.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# ############################################################
#
# Summary of the performance of a curve
#
# ############################################################

import
# Standard library
std/[monotimes, times],
# Internals
constantine/platforms/abstractions,
constantine/named/algebras,
constantine/math/[arithmetic, extension_fields],
constantine/math/pairings/[
pairings_generic,
gt_exponentiations,
gt_exponentiations_vartime,
gt_multiexp, gt_multiexp_parallel,
],
constantine/threadpool,
# Helpers
helpers/prng_unsafe,
./bench_blueprint

export times, monotimes
export notes
export abstractions
proc separator*() =
  ## Zero-argument convenience overload: forwards to the one-argument
  ## `separator` with the fixed value 168
  ## (presumably the line width - confirm in bench_blueprint).
  separator(168)

proc report(op, domain: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  ## Prints one benchmark result line for operation `op` on `domain`.
  ##
  ## - `start`, `stop`: wall-clock bounds of the whole measurement.
  ## - `startClk`, `stopClk`: tick counter bounds, only displayed when
  ##   `SupportsGetTicks` is true.
  ## - `iters`: number of iterations measured; all figures are per iteration.
  let
    ns = inNanoseconds((stop-start) div iters)
    throughput = 1e9 / float64(ns)

  # Pick the line layout at compile time: with or without the cycle count.
  let line =
    when SupportsGetTicks:
      &"{op:<68} {domain:<20} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
    else:
      &"{op:<68} {domain:<20} {throughput:>15.3f} ops/s {ns:>9} ns/op"
  echo line

macro fixFieldDisplay(T: typedesc): untyped =
  ## Builds a string literal naming the field type `T`, e.g. `Fp[<curve>]`.
  ##
  ## At compile-time, enums are integers and their display is buggy:
  ## we would get the Curve ID instead of the curve name, so the ID is
  ## converted back to `Algebra` before being stringified.
  let inst = T.getTypeInst()
  let fieldName = $inst[1][0]                     # e.g. Fp
  let algebraName = $Algebra(inst[1][1].intVal)   # enum name from its integer ID
  result = newLit(fieldName & "[" & algebraName & "]")

func fixDisplay(T: typedesc): string =
  ## Display name of type `T`.
  ## Extension-field types go through `fixFieldDisplay`;
  ## every other type uses its default `$` representation.
  when T isnot (Fp or Fp2 or Fp4 or Fp6 or Fp12):
    $T
  else:
    fixFieldDisplay(T)

func fixDisplay(T: Algebra): string =
  ## Display name of an `Algebra` value: its default `$` representation.
  result = $T

template bench(op: string, T: typed, iters: int, body: untyped): untyped =
  ## Measures `body` over `iters` iterations, then prints a report line
  ## labelled `op` for the type (or algebra) `T`.
  ##
  ## NOTE(review): `startTime`, `stopTime`, `startClk` and `stopClk` are not
  ## declared here; presumably `measure` declares them - confirm in
  ## bench_blueprint.
  measure(iters, startTime, stopTime, startClk, stopClk, body)
  report(
    op, fixDisplay(T),
    startTime, stopTime,
    startClk, stopClk,
    iters)

func random_gt*(rng: var RngState, F: typedesc): F {.inline, noInit.} =
  ## Draws a random `F` with `random_unsafe`, then applies `finalExp`
  ## to it in place.
  ## NOTE(review): presumably the final exponentiation is what maps the
  ## random field element into the pairing target group - confirm.
  result = random_unsafe(rng, F)
  finalExp(result)

# Multi-exponentiations
# ---------------------------------------------------------------------------

type
  BenchMultiexpContext*[GT] = object
    ## State shared by the multi-exponentiation benchmarks of group `GT`.
    tp: Threadpool
      ## Threadpool used for input generation and the parallel benchmark.
    numInputs: int
      ## Number of (element, exponent) pairs stored in the context.
    exponents: seq[getBigInt(GT.Name(), kScalarField)]
      ## One scalar-field-sized big integer per input.
    elems: seq[GT]
      ## One group element per input.

proc createBenchMultiExpContext*(GT: typedesc, inputSizes: openArray[int]): BenchMultiexpContext[GT] =
  ## Creates a benchmark context holding `max(inputSizes)` random
  ## (element, exponent) pairs, generated in parallel on a threadpool.
  ##
  ## Progress and generation time are written to stdout.
  ## The threadpool is shut down before returning.
  const bits = Fr[GT.Name].bits()

  result.tp = Threadpool.new()
  let maxNumInputs = inputSizes.max()

  result.numInputs = maxNumInputs
  result.elems = newSeq[GT](maxNumInputs)
  result.exponents = newSeq[BigInt[bits]](maxNumInputs)

  proc genElemExponentPairsChunk[GT](rngSeed: uint64, start, len: int, elems: ptr GT, exponents: ptr BigInt[bits]) {.nimcall.} =
    ## Fills `len` pairs starting at index `start`.
    ## `elems` and `exponents` point to the first item of each buffer.
    let elemsBuf = cast[ptr UncheckedArray[GT]](elems)
    let exponentsBuf = cast[ptr UncheckedArray[BigInt[bits]]](exponents)

    # RNGs are not threadsafe, create a threadlocal one seeded from the global RNG
    var threadRng: RngState
    threadRng.seed(rngSeed)

    for idx in start ..< start + len:
      elemsBuf[idx] = threadRng.random_gt(GT)
      exponentsBuf[idx] = threadRng.random_unsafe(BigInt[bits])

  # Split [0, maxNumInputs) into chunks, one task spawned per chunk.
  let chunks = balancedChunksPrioNumber(0, maxNumInputs, result.tp.numThreads)

  stdout.write &"Generating {maxNumInputs} (elems, exponents) pairs ... "
  stdout.flushFile()

  let start = getMonotime()

  syncScope:
    for (chunkId, chunkStart, chunkLen) in items(chunks):
      # Each task gets its own seed drawn from the global RNG.
      result.tp.spawn genElemExponentPairsChunk(rng.next(), chunkStart, chunkLen, result.elems[0].addr, result.exponents[0].addr)

  # Even if child threads are sleeping, it seems like perf is lower when there are threads around
  # maybe because the kernel has more overhead or time quantum to keep track of, so shut them down.
  result.tp.shutdown()

  let stop = getMonotime()
  stdout.write &"in {float64(inNanoSeconds(stop-start)) / 1e6:6.3f} ms\n"

proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: int, iters: int) =
  ## Benchmarks 𝔾ₜ multi-exponentiation on the first `numInputs`
  ## (element, exponent) pairs of `ctx`, running each variant `iters` times:
  ##
  ## - one exponentiation per pair followed by a product, constant-time and
  ##   vartime versions (only when `numInputs <= 100000`)
  ## - `multiExp_reference_vartime` (only when `numInputs <= 100000`)
  ## - `multiExp_vartime`
  ## - `multiExp_vartime_parallel`, on a threadpool created and shut down here
  ##
  ## Speedup ratios between the variants are then printed.
  ## The "naive" reference for those ratios is the vartime exponentiation loop.
  ##
  ## `numInputs` must not exceed the number of pairs stored in `ctx`.
  const bits = Fr[GT.Name].bits()

  # The inputs are sliced out of the context buffers below; fail loudly
  # rather than read out of bounds in builds without bound checks.
  doAssert numInputs >= 0 and numInputs <= ctx.elems.len and numInputs <= ctx.exponents.len,
    "numInputs exceeds the number of inputs stored in the benchmark context"

  template elems: untyped = ctx.elems.toOpenArray(0, numInputs-1)
  template exponents: untyped = ctx.exponents.toOpenArray(0, numInputs-1)

  var r{.noInit.}: GT
  var startNaive, stopNaive, startMultiExpBaseline, stopMultiExpBaseline: MonoTime
  var startMultiExpOpt, stopMultiExpOpt, startMultiExpPara, stopMultiExpPara: MonoTime

  if numInputs <= 100000:
    # Constant-time exponentiations: reported, but not used as the
    # "naive" reference for the speedup ratios.
    # startNaive = getMonotime()
    bench("𝔾ₜ exponentiations " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
      var tmp: GT
      r.setOne()
      for i in 0 ..< elems.len:
        tmp.gtExp(elems[i], exponents[i])
        r *= tmp
    # stopNaive = getMonotime()

  if numInputs <= 100000:
    startNaive = getMonotime()
    bench("𝔾ₜ exponentiations vartime " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
      var tmp: GT
      r.setOne()
      for i in 0 ..< elems.len:
        tmp.gtExp_vartime(elems[i], exponents[i])
        r *= tmp
    stopNaive = getMonotime()

  if numInputs <= 100000:
    startMultiExpBaseline = getMonotime()
    bench("𝔾ₜ multi-exponentiations baseline " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
      r.multiExp_reference_vartime(elems, exponents)
    stopMultiExpBaseline = getMonotime()

  block:
    startMultiExpOpt = getMonotime()
    bench("𝔾ₜ multi-exponentiations optimized " & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
      r.multiExp_vartime(elems, exponents)
    stopMultiExpOpt = getMonotime()

  block:
    # Fresh threadpool, alive only for the duration of the parallel benchmark.
    ctx.tp = Threadpool.new()

    startMultiExpPara = getMonotime()
    bench("𝔾ₜ multi-exponentiations" & align($ctx.tp.numThreads & " threads", 11) & align($numInputs, 10) & " (" & $bits & "-bit exponents)", GT, iters):
      ctx.tp.multiExp_vartime_parallel(r, elems, exponents)
    stopMultiExpPara = getMonotime()

    ctx.tp.shutdown()

  # Average time per iteration of each variant, in nanoseconds.
  let perfNaive = inNanoseconds((stopNaive-startNaive) div iters)
  let perfMultiExpBaseline = inNanoseconds((stopMultiExpBaseline-startMultiExpBaseline) div iters)
  let perfMultiExpOpt = inNanoseconds((stopMultiExpOpt-startMultiExpOpt) div iters)
  let perfMultiExpPara = inNanoseconds((stopMultiExpPara-startMultiExpPara) div iters)

  # The naive and baseline timings only exist when those variants ran.
  if numInputs <= 100000:
    let speedupBaseline = float(perfNaive) / float(perfMultiExpBaseline)
    echo &"Speedup ratio baseline over naive linear combination: {speedupBaseline:>6.3f}x"

    let speedupOpt = float(perfNaive) / float(perfMultiExpOpt)
    echo &"Speedup ratio optimized over naive linear combination: {speedupOpt:>6.3f}x"

    let speedupOptBaseline = float(perfMultiExpBaseline) / float(perfMultiExpOpt)
    echo &"Speedup ratio optimized over baseline linear combination: {speedupOptBaseline:>6.3f}x"

  let speedupParaOpt = float(perfMultiExpOpt) / float(perfMultiExpPara)
  echo &"Speedup ratio parallel over optimized linear combination: {speedupParaOpt:>6.3f}x"
1 change: 0 additions & 1 deletion benchmarks/bench_gt_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import
helpers/prng_unsafe,
./bench_blueprint


export notes
export abstractions
proc separator*() = separator(168)
Expand Down
9 changes: 9 additions & 0 deletions constantine.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,8 @@ const testDescMultithreadedCrypto: seq[string] = @[
"tests/parallel/t_ec_shortw_prj_g1_batch_add_parallel.nim",
"tests/parallel/t_ec_shortw_jac_g1_msm_parallel.nim",
"tests/parallel/t_ec_shortw_prj_g1_msm_parallel.nim",
"tests/parallel/t_ec_twedwards_prj_msm_parallel.nim",
"tests/parallel/t_pairing_bls12_381_gt_multiexp_parallel.nim",
]

const benchDesc = [
Expand All @@ -661,6 +663,7 @@ const benchDesc = [
"bench_pairing_bn254_nogami",
"bench_pairing_bn254_snarks",
"bench_gt",
"bench_gt_multiexp_bls12_381",
"bench_summary_bls12_377",
"bench_summary_bls12_381",
"bench_summary_bn254_nogami",
Expand Down Expand Up @@ -1043,6 +1046,12 @@ task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Multi
# Runs the `bench_gt` benchmark through `runBench`.
# The description string is shown in the task listing.
task bench_gt, "Run 𝔾ₜ benchmarks - CC compiler":
  runBench("bench_gt")

# π”Ύβ‚œ - multi-exponentiation
# ------------------------------------------

# Runs the `bench_gt_multiexp_bls12_381` benchmark through `runBench`.
# The description string is shown in the task listing.
task bench_gt_multiexp_bls12_381, "Run 𝔾ₜ multiexponentiation benchmarks for BLS12-381 - CC compiler":
  runBench("bench_gt_multiexp_bls12_381")

# Pairings
# ------------------------------------------

Expand Down
Loading

0 comments on commit 9268502

Please sign in to comment.