diff --git a/grunnur/adapter_base.py b/grunnur/adapter_base.py index 3e861a0..cb7545c 100644 --- a/grunnur/adapter_base.py +++ b/grunnur/adapter_base.py @@ -404,6 +404,6 @@ def __call__( self, queue_adapter: QueueAdapter, *args: BufferAdapter | numpy.generic, - local_mem: int = 0, + cu_dynamic_local_mem: int = 0, ) -> Any: pass diff --git a/grunnur/adapter_cuda.py b/grunnur/adapter_cuda.py index 1fa7b88..3189e41 100644 --- a/grunnur/adapter_cuda.py +++ b/grunnur/adapter_cuda.py @@ -701,7 +701,7 @@ def __call__( self, queue_adapter: QueueAdapter, *args: BufferAdapter | numpy.generic, - local_mem: int = 0, + cu_dynamic_local_mem: int = 0, ) -> None: # Will be checked in the upper levels assert isinstance(queue_adapter, CuQueueAdapter) # noqa: S101 @@ -724,5 +724,5 @@ def __call__( grid=self._grid, block=self._block, stream=queue_adapter._pycuda_stream, # noqa: SLF001 - shared=local_mem, + shared=cu_dynamic_local_mem, ) diff --git a/grunnur/adapter_opencl.py b/grunnur/adapter_opencl.py index 8b100dd..912b062 100644 --- a/grunnur/adapter_opencl.py +++ b/grunnur/adapter_opencl.py @@ -632,11 +632,13 @@ def __call__( self, queue_adapter: QueueAdapter, *args: BufferAdapter | numpy.generic, - local_mem: int = 0, + cu_dynamic_local_mem: int = 0, ) -> pyopencl.Event: - # Local memory size is passed via regular kernel arguments in OpenCL. - # Should be checked in `PreparedKernel`. - assert local_mem == 0 # noqa: S101 + if cu_dynamic_local_mem != 0: + raise ValueError( + "`cu_dynamic_local_mem` must be zero for OpenCL kernels; " + "dynamic local memory allocation is not supported" + ) # We have to keep the signature more general because of the base class, # but the upper levels will ensure this is the case. diff --git a/grunnur/program.py b/grunnur/program.py index 568f7d7..816a6ca 100644 --- a/grunnur/program.py +++ b/grunnur/program.py @@ -314,7 +314,7 @@ def __call__( self, queue: Queue | MultiQueue, *args: MultiArray | Array | Buffer | numpy.generic, - local_mem: int = 0, + cu_dynamic_local_mem: int = 0, ) -> Any: """ Enqueues the kernel on the devices in the given queue. @@ -332,8 +332,10 @@ def __call__( If an argument is a integer-keyed ``dict``, its values corresponding to the device indices the kernel is executed on will be passed as kernel arguments. + :param cu_dynamic_local_mem: **CUDA only.** The size of dynamically allocated local + (shared in CUDA terms) memory, in bytes. That is, the size of + ``extern __shared__`` arrays in CUDA kernels. :param args: kernel arguments. - :param kwds: backend-specific keyword parameters. :returns: a list of ``Event`` objects for enqueued kernels in case of PyOpenCL. """ if isinstance(queue, Queue): @@ -357,7 +359,11 @@ def __call__( single_queue = queue.queues[device] pkernel = self._prepared_kernel_adapters[device] - ret_val = pkernel(single_queue._queue_adapter, *kernel_args, local_mem=local_mem) # noqa: SLF001 + ret_val = pkernel( + single_queue._queue_adapter, # noqa: SLF001 + *kernel_args, + cu_dynamic_local_mem=cu_dynamic_local_mem, + ) ret_vals.append(ret_val) return ret_vals @@ -455,11 +461,11 @@ def __call__( global_size: Sequence[int] | Mapping[BoundDevice, Sequence[int]], local_size: Sequence[int] | None | Mapping[BoundDevice, Sequence[int] | None] = None, *args: MultiArray | Array | Buffer | numpy.generic, - local_mem: int = 0, + cu_dynamic_local_mem: int = 0, ) -> Any: """ A shortcut for :py:meth:`Kernel.prepare` and subsequent :py:meth:`PreparedKernel.__call__`. See their doc entries for details. """ pkernel = self.prepare(global_size, local_size) - return pkernel(queue, *args, local_mem=local_mem) + return pkernel(queue, *args, cu_dynamic_local_mem=cu_dynamic_local_mem) diff --git a/tests/test_program.py b/tests/test_program.py index d45eddb..c85d9c4 100644 --- a/tests/test_program.py +++ b/tests/test_program.py @@ -87,7 +87,7 @@ def test_compile(mock_or_real_context, no_prelude): assert (res == ref).all() # Explicit local_size - res2_dev = Array.from_host(queue, a) # Array.empty(queue, length, numpy.int32) + res2_dev = Array.empty(context.device, [length], numpy.int32) program.kernel.multiply(queue, [length], [length // 2], res2_dev, a_dev, b_dev, c) res2 = res2_dev.get(queue) if not mocked: @@ -518,3 +518,19 @@ def test_builtin_globals(mock_backend_pycuda): assert "max_total_local_size = 1024" in program.sources[context.devices[0]].source assert "max_total_local_size = 512" in program.sources[context.devices[1]].source + + +def test_cu_dynamic_local_mem(mock_context): + src = MockDefTemplate(kernels=[MockKernel("test", [numpy.int32])]) + program = Program([mock_context.device], src) + queue = Queue(mock_context.device) + + if mock_context.api.id == opencl_api_id(): + message = ( + "`cu_dynamic_local_mem` must be zero for OpenCL kernels; " + "dynamic local memory allocation is not supported" + ) + with pytest.raises(ValueError, match=message): + program.kernel.test(queue, [100], [100], numpy.int32(1), cu_dynamic_local_mem=100) + else: + program.kernel.test(queue, [100], [100], numpy.int32(1), cu_dynamic_local_mem=100)