Array4: __cuda_array_interface__ v2
Start implementing the `__cuda_array_interface__` for zero-copy
data exchange on Nvidia CUDA GPUs.
ax3l committed Mar 26, 2022
1 parent e430ab8 commit 44eb90c
Showing 3 changed files with 188 additions and 70 deletions.
119 changes: 81 additions & 38 deletions src/Base/Array4.cpp
@@ -19,6 +19,61 @@ namespace py = pybind11;
using namespace amrex;


namespace
{
/** CPU: __array_interface__ v3
*
* https://numpy.org/doc/stable/reference/arrays.interface.html
*/
template<typename T>
py::dict
array_interface(Array4<T> const & a4)
{
auto d = py::dict();
auto const len = length(a4);
// F->C index conversion here
// p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride];
// Buffer dimensions: zero-size shall not skip dimension
auto shape = py::make_tuple(
a4.ncomp,
len.z <= 0 ? 1 : len.z,
len.y <= 0 ? 1 : len.y,
len.x <= 0 ? 1 : len.x // fastest varying index
);
// buffer protocol strides are in bytes, AMReX strides are elements
auto const strides = py::make_tuple(
sizeof(T) * a4.nstride,
sizeof(T) * a4.kstride,
sizeof(T) * a4.jstride,
sizeof(T) // fastest varying index
);
bool const read_only = false;
d["data"] = py::make_tuple(std::intptr_t(a4.dataPtr()), read_only);
// note: if we want to keep the same global indexing with non-zero
// box small_end as in AMReX, then we can explore playing with
// this offset as well
//d["offset"] = 0; // default
//d["mask"] = py::none(); // default

d["shape"] = shape;
// we could also set this after checking the strides are C-style contiguous:
//if (is_contiguous<T>(shape, strides))
// d["strides"] = py::none(); // C-style contiguous
//else
d["strides"] = strides;

// type description
// for more complicated types, e.g., tuples/structs
//d["descr"] = ...;
// we currently only need this
d["typestr"] = py::format_descriptor<T>::format();

d["version"] = 3;
return d;
}
}
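
For orientation, a worked illustration (editor's sketch, not part of this commit) of the shape and byte strides the helper above reports; the 4x3x2 box, single component, and double precision are assumptions chosen only for the example:

# hypothetical Array4<double> spanning a 4x3x2 box (x fastest varying), 1 component
nx, ny, nz, ncomp = 4, 3, 2, 1
itemsize = 8                       # sizeof(double) in bytes
jstride = nx                       # elements per j step
kstride = nx * ny                  # elements per k step
nstride = nx * ny * nz             # elements per component
shape   = (ncomp, nz, ny, nx)      # C order, slowest to fastest varying
strides = (itemsize * nstride, itemsize * kstride, itemsize * jstride, itemsize)
assert shape == (1, 2, 3, 4)
assert strides == (192, 96, 32, 8)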


template< typename T >
void make_Array4(py::module &m, std::string typestr)
{
@@ -85,56 +140,44 @@ void make_Array4(py::module &m, std::string typestr)
return a4;
}))


// CPU: __array_interface__ v3
// https://numpy.org/doc/stable/reference/arrays.interface.html
.def_property_readonly("__array_interface__", [](Array4<T> const & a4) {
auto d = py::dict();
auto const len = length(a4);
// F->C index conversion here
// p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride];
// Buffer dimensions: zero-size shall not skip dimension
auto shape = py::make_tuple(
a4.ncomp,
len.z <= 0 ? 1 : len.z,
len.y <= 0 ? 1 : len.y,
len.x <= 0 ? 1 : len.x // fastest varying index
);
// buffer protocol strides are in bytes, AMReX strides are elements
auto const strides = py::make_tuple(
sizeof(T) * a4.nstride,
sizeof(T) * a4.kstride,
sizeof(T) * a4.jstride,
sizeof(T) // fastest varying index
);
bool const read_only = false;
d["data"] = py::make_tuple(std::intptr_t(a4.dataPtr()), read_only);
// note: if we want to keep the same global indexing with non-zero
// box small_end as in AMReX, then we can explore playing with
// this offset as well
//d["offset"] = 0; // default
//d["mask"] = py::none(); // default

d["shape"] = shape;
// we could also set this after checking the strides are C-style contiguous:
//if (is_contiguous<T>(shape, strides))
// d["strides"] = py::none(); // C-style contiguous
//else
d["strides"] = strides;

d["typestr"] = py::format_descriptor<T>::format();
d["version"] = 3;
return d;
return array_interface(a4);
})

// CPU: __array_function__ interface (TODO)
//
// NEP 18 — A dispatch mechanism for NumPy's high level array functions.
// https://numpy.org/neps/nep-0018-array-function-protocol.html
// This enables code written for NumPy to operate directly on Array4 arrays.
// The __array_function__ feature requires NumPy 1.16 or later.
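
For context, NEP 18 dispatch is implemented purely on the Python side; a generic, hedged sketch of the pattern (the names HANDLED_FUNCTIONS and Array4Like are illustrative, not pyAMReX API):

HANDLED_FUNCTIONS = {}  # would map NumPy functions to Array4-aware implementations

class Array4Like:
    # illustrative stand-in for a type that opts into NEP 18
    def __array_function__(self, func, types, args, kwargs):
        if func not in HANDLED_FUNCTIONS:
            return NotImplemented  # NumPy then raises TypeError
        return HANDLED_FUNCTIONS[func](*args, **kwargs)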


// TODO: __cuda_array_interface__
// Nvidia GPUs: __cuda_array_interface__ v2
// https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html
.def_property_readonly("__cuda_array_interface__", [](Array4<T> const & a4) {
auto d = array_interface(a4);

// data:
//   The consumer of the interface may or may not be in the same CUDA context,
//   so the most common pattern is to call cuPointerGetAttribute with
//   CU_POINTER_ATTRIBUTE_DEVICE_POINTER in the CUDA driver API (or the
//   equivalent CUDA Runtime API call) to retrieve a device pointer that is
//   usable in the currently active context.
// TODO: for zero-size arrays, the data pointer shall be 0 (null) here.

// note: an optional "stream" key for producer/consumer synchronization was
// only added in __cuda_array_interface__ v3; v2 carries no stream or device
// entry.

d["version"] = 2;
return d;
})


// TODO: __dlpack__
// TODO: __dlpack__ __dlpack_device__
// DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
// https://dmlc.github.io/dlpack/latest/
// https://data-apis.org/array-api/latest/design_topics/data_interchange.html
// https://github.com/data-apis/consortium-feedback/issues/1
// https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
// https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol


.def("contains", &Array4<T>::contains)
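To show how consumers pick these dictionaries up, a hedged, untested sketch (not part of this commit): NumPy reads __array_interface__ when asked for a no-copy view, and any __cuda_array_interface__ consumer (Numba, CuPy, PyTorch) can do the same for device memory, as the tests below exercise.

import numpy as np
import amrex

# host side: wrap a NumPy array in an Array4 and view it back without a copy
x = np.ones((2, 3, 4))
x_arr = amrex.Array4_double(x)           # non-owning view of x
x_view = np.array(x_arr, copy=False)     # consumes __array_interface__
assert x_view.shape == (1, 2, 3, 4)      # (ncomp, z, y, x)
assert x_view.__array_interface__['data'][0] == x.__array_interface__['data'][0]

# device side (CUDA builds): wrap a device array instead, then view it back,
# e.g. with Numba, through __cuda_array_interface__
#import numba
#x_numba = numba.cuda.to_device(x)
#x_arr_d = amrex.Array4_double(x_numba)
#d_view = numba.cuda.as_cuda_array(x_arr_d)
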
91 changes: 59 additions & 32 deletions tests/test_array4.py
@@ -4,6 +4,7 @@
import numpy as np
import amrex


def test_array4_empty():
empty = amrex.Array4_double()

@@ -17,6 +18,7 @@ def test_array4_empty():
assert(emptyc.size == 0)
assert(emptyc.nComp == 0)


def test_array4():
# from numpy (also a non-owning view)
x = np.ones((2, 3, 4,))
@@ -60,35 +62,60 @@ def test_array4():
x[1, 1, 1] = 44
assert(v_carr2np[0, 1, 1, 1] == 44)

# from cupy

# to numpy

# to cupy

return

# Check indexing
assert(obj[0] == 1)
assert(obj[1] == 2)
assert(obj[2] == 3)
assert(obj[-1] == 3)
assert(obj[-2] == 2)
assert(obj[-3] == 1)
with pytest.raises(IndexError):
obj[-4]
with pytest.raises(IndexError):
obj[3]

# Check assignment
obj[0] = 2
obj[1] = 3
obj[2] = 4
assert(obj[0] == 2)
assert(obj[1] == 3)
assert(obj[2] == 4)

#def test_iv_conversions():
# obj = amrex.IntVect.max_vector().numpy()
# assert(isinstance(obj, np.ndarray))
# assert(obj.dtype == np.int32)

@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
reason="Requires AMReX_GPU_BACKEND=CUDA")
def test_array4_numba():
# https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
import numba

# numba -> AMReX Array4
x = np.ones((2, 3, 4,)) # type: numpy.ndarray

# host-to-device copy
x_numba = numba.cuda.to_device(x) # type: numba.cuda.cudadrv.devicearray.DeviceNDArray
#x_cupy = cupy.asarray(x_numba) # type: cupy.ndarray
x_arr = amrex.Array4_double(x_numba) # type: amrex.Array4_double

assert(x_arr.__cuda_array_interface__['data'][0] == x_numba.__cuda_array_interface__['data'][0])

# AMReX -> numba
#arr_numba = cuda.as_cuda_array(arr4)
# ... or as MultiFab test
# TODO
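# hedged sketch (untested, not part of this commit): view the Array4 back in
# Numba; assumes numba.cuda.as_cuda_array accepts any object exporting
# __cuda_array_interface__
x_numba_back = numba.cuda.as_cuda_array(x_arr)
assert(x_numba_back.__cuda_array_interface__['data'][0] == x_numba.__cuda_array_interface__['data'][0])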


@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
reason="Requires AMReX_GPU_BACKEND=CUDA")
def test_array4_cupy():
# https://docs.cupy.dev/en/stable/user_guide/interoperability.html
import cupy as cp

# cupy -> AMReX Array4
x = np.ones((2, 3, 4,)) # TODO: merge into next line and create on device?
x_cupy = cp.asarray(x) # type: cupy.ndarray
print(f"x_cupy={x_cupy}")
print(x_cupy.__cuda_array_interface__)

# cupy -> AMReX array4
x_arr = amrex.Array4_double(x_cupy) # type: amrex.Array4_double
print(f"x_arr={x_arr}")
print(x_arr.__cuda_array_interface__)

assert(x_arr.__cuda_array_interface__['data'][0] == x_cupy.__cuda_array_interface__['data'][0])

# AMReX -> cupy
#arr_numba = cuda.as_cuda_array(arr4)
# ... or as MultiFab test
# TODO
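# hedged sketch (untested, not part of this commit): view the Array4 back in
# CuPy; assumes cp.asarray accepts any object exporting __cuda_array_interface__
x_cupy_back = cp.asarray(x_arr)
assert(x_cupy_back.__cuda_array_interface__['data'][0] == x_cupy.__cuda_array_interface__['data'][0])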


@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
reason="Requires AMReX_GPU_BACKEND=CUDA")
def test_array4_pytorch():
# https://docs.cupy.dev/en/stable/user_guide/interoperability.html#pytorch
#arr_torch = torch.as_tensor(arr, device='cuda')
#assert(arr_torch.__cuda_array_interface__['data'][0] == arr.__cuda_array_interface__['data'][0])
# TODO
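# hedged sketch (untested, not part of this commit): PyTorch CUDA tensor ->
# AMReX Array4 via __cuda_array_interface__; assumes the Array4 constructor
# accepts __cuda_array_interface__ exporters as in the cupy test above
import torch
x_torch = torch.ones(2, 3, 4, device='cuda')     # type: torch.Tensor
x_arr = amrex.Array4_double(x_torch)             # type: amrex.Array4_double
assert(x_arr.__cuda_array_interface__['data'][0] == x_torch.__cuda_array_interface__['data'][0])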

pass
48 changes: 48 additions & 0 deletions tests/test_multifab.py
@@ -142,3 +142,51 @@ def test_mfab_ops(boxarr, distmap, nghost):
np.testing.assert_allclose(dst.min(0), 150.0)
np.testing.assert_allclose(dst.max(0), 150.0)


@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
reason="Requires AMReX_GPU_BACKEND=CUDA")
def test_mfab_ops_cuda_numba():
# https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
import numba

# AMReX -> numba
#arr_numba = cuda.as_cuda_array(arr4)
# TODO


@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
reason="Requires AMReX_GPU_BACKEND=CUDA")
def test_mfab_ops_cuda_cupy():
# https://docs.cupy.dev/en/stable/user_guide/interoperability.html
import cupy as cp

# AMReX -> cupy
#arr_numba = cuda.as_cuda_array(arr4)
# TODO


@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
reason="Requires AMReX_GPU_BACKEND=CUDA")
def test_mfab_ops_cuda_pytorch():
# https://docs.cupy.dev/en/stable/user_guide/interoperability.html#pytorch
import torch

# AMReX -> pytorch
#arr_torch = torch.as_tensor(arr, device='cuda')
#assert(arr_torch.__cuda_array_interface__['data'][0] == arr.__cuda_array_interface__['data'][0])
# TODO


@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
reason="Requires AMReX_GPU_BACKEND=CUDA")
def test_mfab_ops_cuda_cuml():
# https://github.com/rapidsai/cuml
# https://github.com/rapidsai/cudf
# maybe better for particles as a dataframe test
import cudf
import cuml

# AMReX -> RAPIDSAI cuML
#arr_cuml = ...
#assert(arr_cuml.__cuda_array_interface__['data'][0] == arr.__cuda_array_interface__['data'][0])
# TODO
