Skip to content

Commit

Permalink
[tests] add test case for mul
Browse files Browse the repository at this point in the history
I mainly added this to debug a bug I saw
  • Loading branch information
Vindaar committed Sep 19, 2024
1 parent 7b5cb3d commit 0d37690
Showing 1 changed file with 124 additions and 0 deletions.
124 changes: 124 additions & 0 deletions tests/gpu/t_mul.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
# Internal
constantine/named/algebras,
constantine/math/io/[io_bigints, io_fields],
constantine/math/arithmetic,
constantine/platforms/abstractions,
constantine/platforms/llvm/llvm,
constantine/math_compiler/[ir, pub_fields, codegen_nvidia]

proc execMul*[T](jitFn: CUfunction, r: var T; a, b: T) =
  ## Execute a binary operation in the form `r <- op(a, b)` on an Nvidia GPU
  ## using the JIT-compiled kernel `jitFn`.
  ##
  ## The execution wrappers provided are mostly for testing and debugging
  ## low-level kernels that serve as building blocks, like field addition or
  ## multiplication. They aren't parallelizable so we are not concerned about
  ## the grid and block size. We also aren't concerned about the cuda stream
  ## when testing.
  ##
  ## This is not the case for production kernels (multi-scalar-multiplication,
  ## FFT) as we want to execute kernels asynchronously then merge results,
  ## which might require multiple streams.

  static: doAssert cpuEndian == littleEndian, block:
    # From https://developer.nvidia.com/cuda-downloads?target_os=Linux
    # Supported architectures for Cuda are:
    #   x86-64, PowerPC 64 little-endian, ARM64 (aarch64)
    # which are all little-endian at word-level.
    #
    # Due to limbs being also stored in little-endian, on little-endian host
    # the CPU and GPU will have the same binary representation
    # whether we use 32-bit or 64-bit words, so naive memcpy can be used for
    # parameter passing.

    "Most CPUs (x86-64, ARM) are little-endian, as are Nvidia GPUs, which allows naive copying of parameters.\n" &
    "Your architecture '" & $hostCPU & "' is big-endian and GPU offloading is unsupported on it."

  # We assume that all arguments are passed by reference in the Cuda kernel,
  # hence the need for GPU alloc.
  var rGPU, aGPU, bGPU: CUdeviceptr
  check cuMemAlloc(rGPU, csize_t sizeof(r))
  # `defer` right after each allocation so device memory is released even if
  # a later `check` raises (the original leaked on mid-way failure).
  defer: check cuMemFree(rGPU)
  check cuMemAlloc(aGPU, csize_t sizeof(a))
  defer: check cuMemFree(aGPU)
  check cuMemAlloc(bGPU, csize_t sizeof(b))
  defer: check cuMemFree(bGPU)

  check cuMemcpyHtoD(aGPU, a.addr, csize_t sizeof(a))
  check cuMemcpyHtoD(bGPU, b.addr, csize_t sizeof(b))

  # Kernel parameters are passed as an array of pointers to the (host-side)
  # device pointers.
  let params = [pointer(rGPU.addr), pointer(aGPU.addr), pointer(bGPU.addr)]

  # Single-thread launch: these wrappers are for correctness testing only.
  check cuLaunchKernel(
    jitFn,
    1, 1, 1, # grid(x, y, z)
    1, 1, 1, # block(x, y, z)
    sharedMemBytes = 0,
    CUstream(nil),
    params[0].unsafeAddr, nil)

  check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))

# Init LLVM
# -------------------------
# Register the NVPTX backend so kernels can be JIT-compiled to PTX below.
initializeFullNVPTXTarget()

# Init GPU
# -------------------------
let cudaDevice = cudaDeviceInit()
# Query the device's compute capability (SM major/minor); `sm` is later
# passed to `codegenNvidiaPTX` so the emitted PTX targets this GPU.
var sm: tuple[major, minor: int32]
check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice)
check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice)

proc testName[Name: static Algebra](field: type FF[Name], wordSize: int, a, b: FF[Name]) =
  ## JIT-compile a field multiplication kernel for `field` at the given
  ## `wordSize`, execute it on the GPU for inputs `a` and `b`, and check the
  ## result against the CPU implementation.

  # Codegen
  # -------------------------
  let suffix =
    if field is Fp: "_fp"
    else: "_fr"
  let name = $Name & suffix
  let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & name & $wordSize))
  let fd = asy.ctx.configureField(
    name, field.bits(),
    field.getModulus().toHex(),
    v = 1, w = wordSize
  )

  asy.definePrimitives(fd)

  let kernName = asy.genFpMul(fd)
  let ptx = asy.codegenNvidiaPTX(sm)

  # GPU exec
  # -------------------------
  var cuCtx: CUcontext
  var cuMod: CUmodule
  check cuCtxCreate(cuCtx, 0, cudaDevice)
  check cuModuleLoadData(cuMod, ptx)
  defer:
    # Tear down module then context once the test body is done.
    check cuMod.cuModuleUnload()
    check cuCtx.cuCtxDestroy()

  let kernel = cuMod.getCudaKernel(kernName)

  block Logic:
    # CPU reference result
    var expected: field
    expected = a * b

    # GPU result via the JIT-compiled kernel
    var actual: field
    kernel.execMul(actual, a, b)

    echo "CPU = ", expected.toHex()
    echo "GPU = ", actual.toHex()
    doAssert bool(expected == actual)

# Single hard-coded case: multiply 1 by a fixed BN254 field element with
# 64-bit limbs; GPU and CPU results must agree.
let x = Fp[BN254_Snarks].fromUInt(1'u64)
let y = Fp[BN254_Snarks].fromHex("0x2beb0d0d6115007676f30bcc462fe814bf81198848f139621a3e9fa454fe8e6a")

testName(Fp[BN254_Snarks], 64, x, y)

0 comments on commit 0d37690

Please sign in to comment.