Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

π”Ύβ‚œ multi-exponentiations #436

Merged
merged 5 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/bench_elliptic_parallel_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -115,15 +115,15 @@ proc msmParallelBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters:
var startNaive, stopNaive, startMSMbaseline, stopMSMbaseline, startMSMopt, stopMSMopt, startMSMpara, stopMSMpara: MonoTime

if numInputs <= 100000:
startNaive = getMonotime()
# startNaive = getMonotime()
bench("EC scalar muls " & align($numInputs, 10) & " (" & $bits & "-bit coefs, points)", EC, iters):
var tmp: EC
r.setNeutral()
for i in 0 ..< points.len:
tmp.fromAffine(points[i])
tmp.scalarMul(coefs[i])
r += tmp
stopNaive = getMonotime()
# stopNaive = getMonotime()

if numInputs <= 100000:
startNaive = getMonotime()
Expand Down
1 change: 0 additions & 1 deletion benchmarks/bench_gt.nim
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ proc main() =
separator()
staticFor i, 0, AvailableCurves.len:
const curve = AvailableCurves[i]
const bits = Fr[curve].bits()
separator()
mulBench(Fp12[curve], Iters)
sqrBench(Fp12[curve], Iters)
Expand Down
46 changes: 46 additions & 0 deletions benchmarks/bench_gt_multiexp_bls12_381.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
# Internals
constantine/named/algebras,
constantine/math/extension_fields,
# Helpers
./bench_gt_parallel_template

# ############################################################
#
# Benchmark of the 𝔾ₜ group of
# Pairing Friendly curves
#
# ############################################################

const
  # Base iteration budget; `main` divides it by the batch size.
  Iters = 10000

  # Curves to benchmark. Commented-out entries are currently disabled.
  AvailableCurves = [
    # BN254_Nogami,
    # BN254_Snarks,
    # BLS12_377,
    BLS12_381,
  ]

  # Batch sizes (number of (element, exponent) pairs) to benchmark.
  testNumPoints = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]

proc main() =
  ## For every curve in `AvailableCurves`, create one benchmark context sized
  ## for `testNumPoints` and run the multi-exponentiation benchmarks once per
  ## batch size.
  separator()
  staticFor i, 0, AvailableCurves.len:
    const curve = AvailableCurves[i]
    var ctx = createBenchMultiExpContext(Fp12[curve], testNumPoints)
    separator()
    for numPoints in testNumPoints:
      # Larger batches get proportionally fewer iterations, but at least one.
      let itersForBatch = max(1, Iters div numPoints)
      multiExpParallelBench(ctx, numPoints, itersForBatch)
      separator()
    separator()

# Run the benchmarks, then call `notes` (re-exported by the benchmark template).
main()
notes()
1 change: 1 addition & 0 deletions benchmarks/bench_gt_multiexp_bls12_381.nim.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
--threads:on
189 changes: 189 additions & 0 deletions benchmarks/bench_gt_parallel_template.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# ############################################################
#
# Benchmark template for 𝔾ₜ exponentiations and multi-exponentiations
#
# ############################################################

import
# Standard library
std/[monotimes, times],
# Internals
constantine/platforms/abstractions,
constantine/named/algebras,
constantine/math/[arithmetic, extension_fields],
constantine/math/pairings/[
pairings_generic,
gt_exponentiations,
gt_exponentiations_vartime,
gt_multiexp, gt_multiexp_parallel,
],
constantine/threadpool,
# Helpers
helpers/prng_unsafe,
./bench_blueprint

export times, monotimes
export notes
export abstractions
proc separator*() =
  ## Forwards to the width-taking `separator` overload with a fixed width
  ## of 168.
  separator(168)

proc report(op, domain: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
  ## Echo one benchmark result line.
  ##
  ## - `op`, `domain`: labels printed in the first two columns.
  ## - `start`, `stop`: wall-clock timestamps around all `iters` iterations.
  ## - `startClk`, `stopClk`: cycle counter readings, printed only when
  ##   `SupportsGetTicks` is true.
  ## - `iters`: number of iterations; all figures are averaged per iteration.
  let elapsedPerOp = (stop - start) div iters
  let ns = elapsedPerOp.inNanoseconds()
  let throughput = 1e9 / ns.float64
  when SupportsGetTicks:
    echo &"{op:<68} {domain:<20} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
  else:
    echo &"{op:<68} {domain:<20} {throughput:>15.3f} ops/s {ns:>9} ns/op"

macro fixFieldDisplay(T: typedesc): untyped =
  ## Build the display name of a field type as a string literal of the form
  ## `Name[Algebra]`.
  ##
  ## At compile-time, enums are integers and their display is buggy:
  ## we get the Curve ID instead of the curve name. The ID is therefore
  ## converted back to `Algebra` before being stringified.
  let inst = T.getTypeInst()
  let fieldName = $inst[1][0] # Fp
  let algebraName = $Algebra(inst[1][1].intVal)
  result = newLit(fieldName & "[" & algebraName & "]")

func fixDisplay(T: typedesc): string =
  ## Display name of a type: field types go through `fixFieldDisplay`,
  ## every other type uses its default stringification.
  when T is (Fp or Fp2 or Fp4 or Fp6 or Fp12):
    result = fixFieldDisplay(T)
  else:
    result = $T

func fixDisplay(T: Algebra): string = $T
  ## Display name of an `Algebra` value.

template bench(op: string, T: typed, iters: int, body: untyped): untyped =
  ## Run `body` through `measure` with `iters` iterations, then print the
  ## result line for operation `op` on domain `T` via `report`.
  measure(iters, startTime, stopTime, startClk, stopClk, body)
  let domain = fixDisplay(T)
  report(op, domain, startTime, stopTime, startClk, stopClk, iters)

func random_gt*(rng: var RngState, F: typedesc): F {.inline, noInit.} =
  ## Draw a random `F` with `random_unsafe`, then apply `finalExp` to it
  ## in place and return the result.
  result = random_unsafe(rng, F)
  finalExp(result)

# Multi-exponentiations
# ---------------------------------------------------------------------------

type
  BenchMultiexpContext*[GT] = object
    ## Inputs and thread pool shared by the multi-exponentiation benchmarks.
    tp: Threadpool
      ## Thread pool used to generate inputs and to run the parallel benchmark.
    numInputs: int
      ## Number of pairs generated, i.e. the largest requested input size.
    exponents: seq[getBigInt(GT.Name(), kScalarField)]
      ## One exponent per element, same index as in `elems`.
    elems: seq[GT]
      ## Elements to exponentiate.

proc createBenchMultiExpContext*(GT: typedesc, inputSizes: openArray[int]): BenchMultiexpContext[GT] =
  ## Create a benchmark context holding as many random (element, exponent)
  ## pairs as the largest entry of `inputSizes`.
  ##
  ## The pairs are generated in chunks on a thread pool. The pool is shut
  ## down before returning, and the generation time is written to stdout.
  const bits = Fr[GT.Name].bits()

  result.tp = Threadpool.new()
  let maxNumInputs = max(inputSizes)

  result.numInputs = maxNumInputs
  result.elems = newSeq[GT](maxNumInputs)
  result.exponents = newSeq[BigInt[bits]](maxNumInputs)

  proc genElemExponentPairsChunk[GT](rngSeed: uint64, start, len: int, elems: ptr GT, exponents: ptr BigInt[bits]) {.nimcall.} =
    ## Fill `len` consecutive slots, beginning at index `start`, of the two
    ## buffers whose first items are pointed to by `elems` and `exponents`.
    let elemsBuf = cast[ptr UncheckedArray[GT]](elems)
    let exponentsBuf = cast[ptr UncheckedArray[BigInt[bits]]](exponents)

    # RNGs are not threadsafe, so each chunk gets its own generator,
    # seeded with a value drawn from the global RNG by the spawning thread.
    var localRng: RngState
    localRng.seed(rngSeed)

    let stopIdx = start + len
    for idx in start ..< stopIdx:
      elemsBuf[idx] = localRng.random_gt(GT)
      exponentsBuf[idx] = localRng.random_unsafe(BigInt[bits])

  let chunks = balancedChunksPrioNumber(0, maxNumInputs, result.tp.numThreads)

  stdout.write &"Generating {maxNumInputs} (elems, exponents) pairs ... "
  stdout.flushFile()

  let start = getMonotime()

  syncScope:
    for (chunkId, chunkStart, chunkSize) in items(chunks):
      result.tp.spawn genElemExponentPairsChunk(rng.next(), chunkStart, chunkSize, result.elems[0].addr, result.exponents[0].addr)

  # Even when child threads are sleeping, performance seems lower while they
  # are around, maybe because the kernel has more overhead or time quanta to
  # keep track of. So shut them down.
  result.tp.shutdown()

  let stop = getMonotime()
  stdout.write &"in {float64(inNanoSeconds(stop-start)) / 1e6:6.3f} ms\n"

proc multiExpParallelBench*[GT](ctx: var BenchMultiExpContext[GT], numInputs: int, iters: int) =
  ## Benchmark several ways of computing the product of
  ## `elems[i]^exponents[i]` over the first `numInputs` pairs stored in
  ## `ctx`, then print speedup ratios between them.
  ##
  ## - `ctx`: context built by `createBenchMultiExpContext`. Its thread pool
  ##   field is re-created for the parallel run and shut down afterwards.
  ## - `numInputs`: number of pairs to use; must not exceed `ctx.numInputs`.
  ## - `iters`: iteration count handed to every `bench` call.
  ##
  ## The per-exponentiation and baseline runs are skipped above 100000 inputs.
  const bits = Fr[GT.Name].bits()

  # Only `ctx.numInputs` pairs were generated. Without this check an
  # oversized request would slice past the generated data, which is not
  # caught when runtime checks are disabled.
  doAssert numInputs in 0 .. ctx.numInputs,
    "numInputs (" & $numInputs & ") exceeds the " & $ctx.numInputs &
    " pairs available in the benchmark context"

  template elems: untyped = ctx.elems.toOpenArray(0, numInputs-1)
  template exponents: untyped = ctx.exponents.toOpenArray(0, numInputs-1)

  # Common tail of every benchmark label.
  let sizeLabel = align($numInputs, 10) & " (" & $bits & "-bit exponents)"

  var r{.noInit.}: GT
  var startNaive, stopNaive, startMultiExpBaseline, stopMultiExpBaseline: MonoTime
  var startMultiExpOpt, stopMultiExpOpt, startMultiExpPara, stopMultiExpPara: MonoTime

  # NOTE(review): every `bench` call is kept in its own `if`/`block` scope,
  # presumably because `bench` declares its timing variables in the calling
  # scope - confirm before merging any of these scopes.

  if numInputs <= 100000:
    # Reported only; not part of the speedup ratios below.
    bench("𝔾ₜ exponentiations " & sizeLabel, GT, iters):
      var tmp: GT
      r.setOne()
      for i in 0 ..< elems.len:
        tmp.gtExp(elems[i], exponents[i])
        r *= tmp

  if numInputs <= 100000:
    # This run is the "naive" reference for the speedup ratios.
    startNaive = getMonotime()
    bench("𝔾ₜ exponentiations vartime " & sizeLabel, GT, iters):
      var tmp: GT
      r.setOne()
      for i in 0 ..< elems.len:
        tmp.gtExp_vartime(elems[i], exponents[i])
        r *= tmp
    stopNaive = getMonotime()

  if numInputs <= 100000:
    startMultiExpBaseline = getMonotime()
    bench("𝔾ₜ multi-exponentiations baseline " & sizeLabel, GT, iters):
      r.multiExp_reference_vartime(elems, exponents)
    stopMultiExpBaseline = getMonotime()

  block:
    startMultiExpOpt = getMonotime()
    bench("𝔾ₜ multi-exponentiations optimized " & sizeLabel, GT, iters):
      r.multiExp_vartime(elems, exponents)
    stopMultiExpOpt = getMonotime()

  block:
    # The pool only lives for the duration of the parallel benchmark.
    ctx.tp = Threadpool.new()

    startMultiExpPara = getMonotime()
    bench("𝔾ₜ multi-exponentiations" & align($ctx.tp.numThreads & " threads", 11) & sizeLabel, GT, iters):
      ctx.tp.multiExp_vartime_parallel(r, elems, exponents)
    stopMultiExpPara = getMonotime()

    ctx.tp.shutdown()

  let perfNaive = inNanoseconds((stopNaive-startNaive) div iters)
  let perfMultiExpBaseline = inNanoseconds((stopMultiExpBaseline-startMultiExpBaseline) div iters)
  let perfMultiExpOpt = inNanoseconds((stopMultiExpOpt-startMultiExpOpt) div iters)
  let perfMultiExpPara = inNanoseconds((stopMultiExpPara-startMultiExpPara) div iters)

  # Ratios involving the naive and baseline runs only make sense when those
  # runs were actually executed.
  if numInputs <= 100000:
    let speedupBaseline = float(perfNaive) / float(perfMultiExpBaseline)
    echo &"Speedup ratio baseline over naive linear combination: {speedupBaseline:>6.3f}x"

    let speedupOpt = float(perfNaive) / float(perfMultiExpOpt)
    echo &"Speedup ratio optimized over naive linear combination: {speedupOpt:>6.3f}x"

    let speedupOptBaseline = float(perfMultiExpBaseline) / float(perfMultiExpOpt)
    echo &"Speedup ratio optimized over baseline linear combination: {speedupOptBaseline:>6.3f}x"

  let speedupParaOpt = float(perfMultiExpOpt) / float(perfMultiExpPara)
  echo &"Speedup ratio parallel over optimized linear combination: {speedupParaOpt:>6.3f}x"
1 change: 0 additions & 1 deletion benchmarks/bench_gt_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import
helpers/prng_unsafe,
./bench_blueprint


export notes
export abstractions
proc separator*() = separator(168)
Expand Down
9 changes: 9 additions & 0 deletions constantine.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,8 @@ const testDescMultithreadedCrypto: seq[string] = @[
"tests/parallel/t_ec_shortw_prj_g1_batch_add_parallel.nim",
"tests/parallel/t_ec_shortw_jac_g1_msm_parallel.nim",
"tests/parallel/t_ec_shortw_prj_g1_msm_parallel.nim",
"tests/parallel/t_ec_twedwards_prj_msm_parallel.nim",
"tests/parallel/t_pairing_bls12_381_gt_multiexp_parallel.nim",
]

const benchDesc = [
Expand All @@ -661,6 +663,7 @@ const benchDesc = [
"bench_pairing_bn254_nogami",
"bench_pairing_bn254_snarks",
"bench_gt",
"bench_gt_multiexp_bls12_381",
"bench_summary_bls12_377",
"bench_summary_bls12_381",
"bench_summary_bn254_nogami",
Expand Down Expand Up @@ -1043,6 +1046,12 @@ task bench_ec_g2_scalar_mul, "Run benchmark on Elliptic Curve group 𝔾2 (Multi
# Nimble task: runs the "bench_gt" benchmark through `runBench`.
task bench_gt, "Run 𝔾ₜ benchmarks - CC compiler":
  runBench("bench_gt")

# π”Ύβ‚œ - multi-exponentiation
# ------------------------------------------

# Nimble task: runs the "bench_gt_multiexp_bls12_381" benchmark through `runBench`.
task bench_gt_multiexp_bls12_381, "Run 𝔾ₜ multiexponentiation benchmarks for BLS12-381 - CC compiler":
  runBench("bench_gt_multiexp_bls12_381")

# Pairings
# ------------------------------------------

Expand Down
Loading
Loading