diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py index 0c03c789..db5f57cf 100644 --- a/cuda_core/cuda/core/experimental/_device.py +++ b/cuda_core/cuda/core/experimental/_device.py @@ -7,7 +7,7 @@ from cuda import cuda, cudart from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool +from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource from cuda.core.experimental._stream import Stream, StreamOptions, default_stream from cuda.core.experimental._utils import ComputeCapability, CUDAError, handle_return, precondition @@ -62,7 +62,17 @@ def __new__(cls, device_id=None): for dev_id in range(total): dev = super().__new__(cls) dev._id = dev_id - dev._mr = _DefaultAsyncMempool(dev_id) + # If the device is in TCC mode, or does not support memory pools for some other reason, + # use the SynchronousMemoryResource which does not use memory pools. 
+ if ( + handle_return( + cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, dev_id) + ) + ) == 1: + dev._mr = _DefaultAsyncMempool(dev_id) + else: + dev._mr = _SynchronousMemoryResource(dev_id) + dev._has_inited = False _tls.devices.append(dev) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 8cc8717e..12fafb39 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -293,3 +293,33 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") + + +class _SynchronousMemoryResource(MemoryResource): + __slots__ = ("_dev_id",) + + def __init__(self, dev_id): + self._handle = None + self._dev_id = dev_id + + def allocate(self, size, stream=None) -> Buffer: + ptr = handle_return(cuda.cuMemAlloc(size)) + return Buffer(ptr, size, self) + + def deallocate(self, ptr, size, stream=None): + if stream is None: + stream = default_stream() + stream.sync() + handle_return(cuda.cuMemFree(ptr)) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return self._dev_id diff --git a/cuda_core/docs/source/release/0.1.1-notes.md b/cuda_core/docs/source/release/0.1.1-notes.md index 473352a4..41db721a 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.md +++ b/cuda_core/docs/source/release/0.1.1-notes.md @@ -5,7 +5,7 @@ Released on Dec XX, 2024 ## Hightlights - Add `StridedMemoryView` and `@args_viewable_as_strided_memory` that provide a concrete implementation of DLPack & CUDA Array Interface supports. - +- Support TCC devices with a default synchronous memory resource to avoid the use of memory pools ## Limitations