Commit ec31a91

Rename local_mem to cu_dynamic_local_mem

fjarri committed Jul 27, 2024
1 parent dbf2ea1
Showing 5 changed files with 37 additions and 13 deletions.
grunnur/adapter_base.py (2 changes: 1 addition & 1 deletion)
@@ -404,6 +404,6 @@ def __call__(
         self,
         queue_adapter: QueueAdapter,
         *args: BufferAdapter | numpy.generic,
-        local_mem: int = 0,
+        cu_dynamic_local_mem: int = 0,
     ) -> Any:
         pass
grunnur/adapter_cuda.py (4 changes: 2 additions & 2 deletions)
@@ -701,7 +701,7 @@ def __call__(
         self,
         queue_adapter: QueueAdapter,
         *args: BufferAdapter | numpy.generic,
-        local_mem: int = 0,
+        cu_dynamic_local_mem: int = 0,
     ) -> None:
         # Will be checked in the upper levels
         assert isinstance(queue_adapter, CuQueueAdapter)  # noqa: S101
@@ -724,5 +724,5 @@ def __call__(
             grid=self._grid,
             block=self._block,
             stream=queue_adapter._pycuda_stream,  # noqa: SLF001
-            shared=local_mem,
+            shared=cu_dynamic_local_mem,
         )
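
For reference, PyCUDA's `shared=` launch keyword, which the adapter forwards `cu_dynamic_local_mem` into above, sets the byte size of the kernel's `extern __shared__` array at launch time. A minimal self-contained PyCUDA sketch of that mechanism, independent of grunnur (the kernel and all names here are illustrative, not from this repository):

    import numpy
    import pycuda.autoinit  # noqa: F401  (creates a default CUDA context)
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    module = SourceModule("""
    __global__ void double_it(float *out, const float *in)
    {
        // Size unknown at compile time; set per launch via `shared`.
        extern __shared__ float tile[];
        int i = threadIdx.x;
        tile[i] = in[i] * 2.0f;
        out[i] = tile[i];
    }
    """)

    double_it = module.get_function("double_it")
    a = numpy.arange(128, dtype=numpy.float32)
    out = numpy.empty_like(a)
    # `shared` is measured in bytes, exactly like `cu_dynamic_local_mem`.
    double_it(drv.Out(out), drv.In(a), block=(128, 1, 1), grid=(1, 1), shared=a.nbytes)
    assert (out == a * 2).all()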
grunnur/adapter_opencl.py (10 changes: 6 additions & 4 deletions)
@@ -632,11 +632,13 @@ def __call__(
         self,
         queue_adapter: QueueAdapter,
         *args: BufferAdapter | numpy.generic,
-        local_mem: int = 0,
+        cu_dynamic_local_mem: int = 0,
     ) -> pyopencl.Event:
         # Local memory size is passed via regular kernel arguments in OpenCL.
         # Should be checked in `PreparedKernel`.
-        assert local_mem == 0  # noqa: S101
+        if cu_dynamic_local_mem != 0:
+            raise ValueError(
+                "`cu_dynamic_local_mem` must be zero for OpenCL kernels; "
+                "dynamic local memory allocation is not supported"
+            )

         # We have to keep the signature more general because of the base class,
         # but the upper levels will ensure this is the case.
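
The comment in this hunk is the reason the keyword is CUDA-only: in OpenCL, local memory is sized through an ordinary `__local` kernel argument rather than a launch parameter. A minimal PyOpenCL sketch of that convention (illustrative only, not grunnur's API):

    import numpy
    import pyopencl as cl

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    program = cl.Program(ctx, """
    __kernel void double_it(__global float *out, __global const float *in,
                            __local float *tile)
    {
        int i = get_local_id(0);
        tile[i] = in[i] * 2.0f;
        out[i] = tile[i];
    }
    """).build()

    a = numpy.arange(128, dtype=numpy.float32)
    mf = cl.mem_flags
    in_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    out_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.nbytes)

    # The local array is sized by a regular kernel argument,
    # not by a separate launch parameter as in CUDA.
    program.double_it(queue, (128,), (128,), out_buf, in_buf, cl.LocalMemory(a.nbytes))

    out = numpy.empty_like(a)
    cl.enqueue_copy(queue, out, out_buf)
    assert (out == a * 2).all()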
grunnur/program.py (16 changes: 11 additions & 5 deletions)
@@ -314,7 +314,7 @@ def __call__(
         self,
         queue: Queue | MultiQueue,
         *args: MultiArray | Array | Buffer | numpy.generic,
-        local_mem: int = 0,
+        cu_dynamic_local_mem: int = 0,
     ) -> Any:
         """
         Enqueues the kernel on the devices in the given queue.
@@ -332,8 +332,10 @@ def __call__(
             If an argument is a integer-keyed ``dict``, its values corresponding to the
             device indices the kernel is executed on will be passed as kernel arguments.
+        :param cu_dynamic_local_mem: **CUDA only.** The size of dynamically allocated local
+            (shared in CUDA terms) memory, in bytes. That is, the size of
+            ``extern __shared__`` arrays in CUDA kernels.
         :param args: kernel arguments.
-        :param kwds: backend-specific keyword parameters.
         :returns: a list of ``Event`` objects for enqueued kernels in case of PyOpenCL.
         """
         if isinstance(queue, Queue):
@@ -357,7 +359,11 @@ def __call__(
             single_queue = queue.queues[device]

             pkernel = self._prepared_kernel_adapters[device]
-            ret_val = pkernel(single_queue._queue_adapter, *kernel_args, local_mem=local_mem)  # noqa: SLF001
+            ret_val = pkernel(
+                single_queue._queue_adapter,  # noqa: SLF001
+                *kernel_args,
+                cu_dynamic_local_mem=cu_dynamic_local_mem,
+            )
             ret_vals.append(ret_val)

         return ret_vals
@@ -455,11 +461,11 @@ def __call__(
         global_size: Sequence[int] | Mapping[BoundDevice, Sequence[int]],
         local_size: Sequence[int] | None | Mapping[BoundDevice, Sequence[int] | None] = None,
         *args: MultiArray | Array | Buffer | numpy.generic,
-        local_mem: int = 0,
+        cu_dynamic_local_mem: int = 0,
     ) -> Any:
         """
         A shortcut for :py:meth:`Kernel.prepare` and subsequent :py:meth:`PreparedKernel.__call__`.
         See their doc entries for details.
         """
         pkernel = self.prepare(global_size, local_size)
-        return pkernel(queue, *args, local_mem=local_mem)
+        return pkernel(queue, *args, cu_dynamic_local_mem=cu_dynamic_local_mem)
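
To see the renamed keyword end to end, here is a sketch of a grunnur kernel launch that uses it. `Program`, `Queue`, `Array.from_host`, and the `program.kernel.<name>(...)` call pattern all appear elsewhere in this diff; the raw CUDA source and the pre-existing CUDA-backed `context` object are assumptions for illustration:

    import numpy
    from grunnur import Array, Program, Queue

    # Assumes `context` is an existing CUDA-backed grunnur Context and
    # that raw CUDA source is accepted here (the `no_prelude` path
    # exercised in the tests below).
    src = """
    extern "C" __global__ void double_it(int *dest)
    {
        // Sized at launch via cu_dynamic_local_mem.
        extern __shared__ int temp[];
        int i = threadIdx.x;
        temp[i] = dest[i] * 2;
        dest[i] = temp[i];
    }
    """

    length = 128
    program = Program([context.device], src)
    queue = Queue(context.device)

    a = numpy.arange(length, dtype=numpy.int32)
    a_dev = Array.from_host(queue, a)

    # The new keyword: size of the `extern __shared__` array, in bytes.
    program.kernel.double_it(
        queue, [length], [length], a_dev,
        cu_dynamic_local_mem=length * a.dtype.itemsize,
    )
    assert (a_dev.get(queue) == a * 2).all()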
tests/test_program.py (18 changes: 17 additions & 1 deletion)
@@ -87,7 +87,7 @@ def test_compile(mock_or_real_context, no_prelude):
     assert (res == ref).all()

     # Explicit local_size
-    res2_dev = Array.from_host(queue, a)  # Array.empty(queue, length, numpy.int32)
+    res2_dev = Array.empty(context.device, [length], numpy.int32)
     program.kernel.multiply(queue, [length], [length // 2], res2_dev, a_dev, b_dev, c)
     res2 = res2_dev.get(queue)
     if not mocked:
@@ -518,3 +518,19 @@ def test_builtin_globals(mock_backend_pycuda):

     assert "max_total_local_size = 1024" in program.sources[context.devices[0]].source
     assert "max_total_local_size = 512" in program.sources[context.devices[1]].source
+
+
+def test_cu_dynamic_local_mem(mock_context):
+    src = MockDefTemplate(kernels=[MockKernel("test", [numpy.int32])])
+    program = Program([mock_context.device], src)
+    queue = Queue(mock_context.device)
+
+    if mock_context.api.id == opencl_api_id():
+        message = (
+            "`cu_dynamic_local_mem` must be zero for OpenCL kernels; "
+            "dynamic local memory allocation is not supported"
+        )
+        with pytest.raises(ValueError, match=message):
+            program.kernel.test(queue, [100], [100], numpy.int32(1), cu_dynamic_local_mem=100)
+    else:
+        program.kernel.test(queue, [100], [100], numpy.int32(1), cu_dynamic_local_mem=100)
