Skip to content

Commit

Permalink
Merge pull request #627 from mrava87/feat-fourierradonnumbaimprov
Browse files Browse the repository at this point in the history
feat: speedup fourierradon with engine=cuda
  • Loading branch information
mrava87 authored Dec 11, 2024
2 parents 0f526c1 + 02cfddd commit 9ed06e3
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 12 deletions.
18 changes: 15 additions & 3 deletions pylops/signalprocessing/_fourierradon2d_cuda.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from cmath import exp
from math import pi

import cupy as cp
from numba import cuda

TWO_PI_MINUS = cp.float32(-2.0 * pi)
TWO_PI_PLUS = cp.float32(2.0 * pi)
IMG = cp.complex64(1j)


@cuda.jit
def _radon_inner_2d_kernel(x, y, f, px, h, flim0, flim1, npx, nh):
Expand All @@ -16,7 +20,11 @@ def _radon_inner_2d_kernel(x, y, f, px, h, flim0, flim1, npx, nh):
ih, ifr = cuda.grid(2)
if ih < nh and ifr >= flim0 and ifr <= flim1:
for ipx in range(npx):
y[ih, ifr] += x[ipx, ifr] * exp(-1j * 2 * pi * f[ifr] * px[ipx] * h[ih])
# slow computation of exp(1j * x)
# y[ih, ifr] += x[ipx, ifr] * exp(TWO_PI_MINUS * f[ifr] * px[ipx] * h[ih])
# fast computation of exp(1j * x) - see https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp/9863048#9863048
s, c = cuda.libdevice.sincosf(TWO_PI_MINUS * f[ifr] * px[ipx] * h[ih])
y[ih, ifr] += x[ipx, ifr] * (c + IMG * s)


@cuda.jit
Expand All @@ -31,7 +39,11 @@ def _aradon_inner_2d_kernel(x, y, f, px, h, flim0, flim1, npx, nh):
ipx, ifr = cuda.grid(2)
if ipx < npx and ifr >= flim0 and ifr <= flim1:
for ih in range(nh):
x[ipx, ifr] += y[ih, ifr] * exp(1j * 2 * pi * f[ifr] * px[ipx] * h[ih])
# slow computation of exp(1j * x)
# x[ipx, ifr] += y[ih, ifr] * exp(TWO_PI_I_PLUS * f[ifr] * px[ipx] * h[ih])
# fast computation of exp(1j * x) - see https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp/9863048#9863048
s, c = cuda.libdevice.sincosf(TWO_PI_PLUS * f[ifr] * px[ipx] * h[ih])
x[ipx, ifr] += y[ih, ifr] * (c + IMG * s)


def _radon_inner_2d_cuda(
Expand Down
25 changes: 20 additions & 5 deletions pylops/signalprocessing/_fourierradon3d_cuda.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from cmath import exp
from math import pi

import cupy as cp
from numba import cuda

TWO_PI_MINUS = cp.float32(-2.0 * pi)
TWO_PI_PLUS = cp.float32(2.0 * pi)
IMG = cp.complex64(1j)


@cuda.jit
def _radon_inner_3d_kernel(x, y, f, py, px, hy, hx, flim0, flim1, npy, npx, nhy, nhx):
Expand All @@ -17,9 +21,15 @@ def _radon_inner_3d_kernel(x, y, f, py, px, hy, hx, flim0, flim1, npy, npx, nhy,
if ihy < nhy and ihx < nhx and ifr >= flim0 and ifr <= flim1:
for ipy in range(npy):
for ipx in range(npx):
y[ihy, ihx, ifr] += x[ipy, ipx, ifr] * exp(
-1j * 2 * pi * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
# slow computation of exp(1j * x)
# y[ihy, ihx, ifr] += x[ipy, ipx, ifr] * exp(
# TWO_PI_MINUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
# )
# fast computation of exp(1j * x) - see https://stackoverflow.com/questions/9860711/cucomplex-h-and-exp/9863048#9863048
s, c = cuda.libdevice.sincosf(
TWO_PI_MINUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
)
y[ihy, ihx, ifr] += x[ipy, ipx, ifr] * (c + IMG * s)


@cuda.jit
Expand All @@ -35,9 +45,14 @@ def _aradon_inner_3d_kernel(x, y, f, py, px, hy, hx, flim0, flim1, npy, npx, nhy
if ipy < npy and ipx < npx and ifr >= flim0 and ifr <= flim1:
for ihy in range(nhy):
for ihx in range(nhx):
x[ipy, ipx, ifr] += y[ihy, ihx, ifr] * exp(
1j * 2 * pi * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
# slow computation of exp(1j * x)
# x[ipy, ipx, ifr] += y[ihy, ihx, ifr] * exp(
# TWO_PI_I_PLUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
# )
s, c = cuda.libdevice.sincosf(
TWO_PI_PLUS * f[ifr] * (py[ipy] * hy[ihy] + px[ipx] * hx[ihx])
)
x[ipy, ipx, ifr] += y[ihy, ihx, ifr] * (c + IMG * s)


def _radon_inner_3d_cuda(
Expand Down
6 changes: 4 additions & 2 deletions pylops/signalprocessing/fourierradon2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
from pylops.utils.typing import DTypeLike, NDArray

jit_message = deps.numba_import("the radon2d module")
cupy_message = deps.cupy_import("the radon2d module")

if jit_message is None:
from ._fourierradon2d_cuda import _aradon_inner_2d_cuda, _radon_inner_2d_cuda
from ._fourierradon2d_numba import _aradon_inner_2d, _radon_inner_2d
if jit_message is None and cupy_message is None:
from ._fourierradon2d_cuda import _aradon_inner_2d_cuda, _radon_inner_2d_cuda

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.WARNING)

Expand Down Expand Up @@ -167,7 +169,7 @@ def __init__(
self._register_multiplications(engine)

def _register_multiplications(self, engine: str) -> None:
if engine == "numba" and jit_message is None:
if engine == "numba":
self._matvec = self._matvec_numba
self._rmatvec = self._rmatvec_numba
elif engine == "cuda":
Expand Down
6 changes: 4 additions & 2 deletions pylops/signalprocessing/fourierradon3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
from pylops.utils.typing import DTypeLike, NDArray

jit_message = deps.numba_import("the radon2d module")
cupy_message = deps.cupy_import("the radon2d module")

if jit_message is None:
from ._fourierradon3d_cuda import _aradon_inner_3d_cuda, _radon_inner_3d_cuda
from ._fourierradon3d_numba import _aradon_inner_3d, _radon_inner_3d
if jit_message is None and cupy_message is None:
from ._fourierradon3d_cuda import _aradon_inner_3d_cuda, _radon_inner_3d_cuda

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.WARNING)

Expand Down Expand Up @@ -192,7 +194,7 @@ def __init__(
self._register_multiplications(engine)

def _register_multiplications(self, engine: str) -> None:
if engine == "numba" and jit_message is None:
if engine == "numba":
self._matvec = self._matvec_numba
self._rmatvec = self._rmatvec_numba
elif engine == "cuda":
Expand Down

0 comments on commit 9ed06e3

Please sign in to comment.