An NVIDIA CUDA Foreign Function Interface (FFI) library.
- Run CUDA code in Python from a string or a file
- Follows common Foreign Function Interface (FFI) design patterns, like ctypes or node-ffi
- Automatic argument conversion and data transfer
- Supported argument types: `int`, `array`, `bytes`, `bytearray`, `str`, `numpy`, and any object implementing the buffer protocol
- Extensible argument typing through `CudaDataType.register()`
- Argument type checking to prevent errors
- Chaining of CUDA kernels and memory transfers in a graph for highest efficiency
```python
from cudaffi import CudaModule

# a raw string keeps the \n escape intact in the CUDA source
mod = CudaModule(r"""
__global__ void tryme(char *str) {
    printf("string is: %s\n", str);
}
""")

mod.tryme("this is a test")
# string is: this is a test
```
```python
import numpy

from cudaffi import CudaModule

mod = CudaModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
    const int i = threadIdx.x;
    dest[i] = a[i] * b[i];
}
""")

# arg type checking and memory transfer directions
mod.multiply_them.arg_types = [("output", "numpy"), ("input", "numpy"), ("input", "numpy")]

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
out = numpy.zeros_like(a)

# type checking
mod.multiply_them(out, a, [1, 2, 3, 4])
# TypeError: expected numpy for arg2

# manually set block and grid size
mod.multiply_them(out, a, b, block=(400, 1, 1), grid=(1, 1, 1))

# use 'autoout' and specify a size callback to automatically allocate and
# convert the return value into the desired format
mod.multiply_them.arg_types = [("autoout", "numpy", lambda args: args[1].size), ("input", "numpy"), ("input", "numpy")]

# validate that the input arrays have the same shape
mod.multiply_them.validate_args = lambda args: args[1].shape == args[2].shape

# default CUDA block size is the size of the input array
mod.multiply_them.default_block = lambda args: (args[1].size, 1, 1)

# the return value is created automatically from the 'autoout' arg; if
# multiple autoouts are defined, a tuple of results is returned
out = mod.multiply_them(a, b)
```
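With `validate_args` set as above, mismatched inputs should be rejected before the kernel ever launches. A quick sketch; the exact exception cudaffi raises here is an assumption:

```python
# shapes differ, so the validate_args hook above should reject this call
# (the specific exception type is an assumption)
short_b = numpy.random.randn(2).astype(numpy.float32)
mod.multiply_them(a, short_b)  # raises: argument validation error
```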
```python
from typing import Any

import numpy as np

from cudaffi import CudaModule, cuda_plan

mod = CudaModule.load_file("test_graph.cu")

mod.start_ker.arg_types = [("input", "numpy"), ("autoout", "bytes"), ("autoout", "int32")]
mod.middle_ker.arg_types = [("input", "bytes"), ("input", "int32"), ("autoout", "bytes"), ("autoout", "int16")]
mod.end_ker.arg_types = [("input", "bytes"), ("input", "int16"), ("autoout", "int16")]

# creates a graph with all the memcpy, memalloc, and kernel nodes and their dependencies
@cuda_plan
def my_plan(arr: np.ndarray[Any, Any]) -> int:
    b, sz = mod.start_ker(arr)
    b2, sz2 = mod.middle_ker(b, sz)
    return mod.end_ker(b2, sz2)
```
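Calling the decorated plan then executes the whole graph at once. A minimal usage sketch, assuming the plan is invoked like an ordinary function:

```python
# hypothetical invocation of the plan defined above
arr = np.arange(400, dtype=np.float32)
result = my_plan(arr)  # runs start_ker -> middle_ker -> end_ker as one graph
print(result)          # the int16 autoout from end_ker
```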
```python
from cudaffi import CudaDevice, CudaStream, CudaContext

CudaDevice.set_default(0)
d = CudaDevice.get_default()
print("Device:", d.name, d.compute_capability, d.driver_version)

s = CudaStream()
CudaStream.set_default(s)

ctx = CudaContext()
CudaContext.set_default(ctx)

# ...

d = CudaDevice.get_default()
s = CudaStream.get_default()
ctx = CudaContext.get_default()
```
See `cudaffi/datatypes/*.py` for examples.
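As a rough illustration, registering a custom type might look something like the sketch below; the subclass hook name and the `register()` signature are assumptions rather than documented API, so treat the files above as the authoritative pattern:

```python
# Hypothetical sketch only: 'convert' and the register() signature are
# assumptions -- check cudaffi/datatypes/*.py for the real interface.
import array

from cudaffi import CudaDataType

class FloatListType(CudaDataType):
    """Treat plain Python lists of floats as transferable buffers."""

    def convert(self, data):  # assumed hook name
        # array.array implements the buffer protocol, which cudaffi
        # already knows how to transfer to and from the device
        return array.array("f", data)

CudaDataType.register("float_list", FloatListType)  # assumed signature
```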