diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 97813182deb..70d5bcbb501 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -85,7 +85,8 @@ public static LogConf logToStderr() { * @param allocationMode Allocation strategy to use. Bit set using * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#ARENA}, + * {@link RmmAllocationMode#CUDA_ASYNC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param enableLogging Enable logging memory manager events * @param poolSize The initial pool size in bytes @@ -106,7 +107,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po * @param allocationMode Allocation strategy to use. Bit set using * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#ARENA}, + * {@link RmmAllocationMode#CUDA_ASYNC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param enableLogging Enable logging memory manager events * @param poolSize The initial pool size in bytes @@ -138,7 +140,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po * @param allocationMode Allocation strategy to use. Bit set using * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#ARENA}, + * {@link RmmAllocationMode#CUDA_ASYNC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -159,7 +162,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, * @param allocationMode Allocation strategy to use. Bit set using * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#ARENA}, + * {@link RmmAllocationMode#CUDA_ASYNC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -168,8 +172,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, * @throws IllegalStateException if RMM has already been initialized * @throws IllegalArgumentException if a max pool size is specified but the allocation mode * is not {@link RmmAllocationMode#POOL} or - * {@link RmmAllocationMode#ARENA}, or the maximum pool size is - * below the initial size. + * {@link RmmAllocationMode#ARENA} or + * {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool + * size is below the initial size. */ public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize, long maxPoolSize) throws RmmException { @@ -186,7 +191,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, * @param allocationMode Allocation strategy to use. Bit set using * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#ARENA}, + * {@link RmmAllocationMode#CUDA_ASYNC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -198,24 +204,35 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, * @throws IllegalStateException if RMM has already been initialized * @throws IllegalArgumentException if a max pool size is specified but the allocation mode * is not {@link RmmAllocationMode#POOL} or - * {@link RmmAllocationMode#ARENA}, or the maximum pool size is - * below the initial size. + * {@link RmmAllocationMode#ARENA} or + * {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool + * size is below the initial size. */ public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize, long maxPoolSize, long allocationAlignment, long alignmentThreshold) throws RmmException { if (initialized) { throw new IllegalStateException("RMM is already initialized"); } + + boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0; + boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0; + boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0; + boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0; + if (maxPoolSize > 0) { - if (allocationMode != RmmAllocationMode.POOL && allocationMode != RmmAllocationMode.ARENA) { + if (!isPool && !isArena && !isAsync) { throw new IllegalArgumentException( - "Pool limit only supported in POOL or ARENA allocation mode"); + "Pool limit only supported in POOL, ARENA, or CUDA_ASYNC allocation mode"); } if (maxPoolSize < poolSize) { throw new IllegalArgumentException("Pool limit of " + maxPoolSize + " is less than initial pool size of " + poolSize); } } + if (isAsync && isManaged) { + throw new IllegalArgumentException( + "CUDA Unified Memory is not supported in CUDA_ASYNC allocation mode"); + } LogLoc loc = LogLoc.NONE; String path = null; if (logConf != null) { diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 89230a06dd7..966c21bee22 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,4 +32,8 @@ public class RmmAllocationMode { * Use arena suballocation strategy */ public static final int ARENA = 0x00000004; + /** + * Use CUDA async suballocation strategy + */ + public static final int CUDA_ASYNC = 0x00000008; } diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 0105f8c43ca..59469dd415c 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -22,7 +22,9 @@ #include #include +#include #include +#include #include #include #include @@ -344,6 +346,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal( bool use_pool_alloc = allocation_mode & 1; bool use_managed_mem = allocation_mode & 2; bool use_arena_alloc = allocation_mode & 4; + bool use_cuda_async_alloc = allocation_mode & 8; if (use_pool_alloc) { auto pool_limit = (max_pool_size > 0) ? thrust::optional{static_cast(max_pool_size)} : @@ -365,6 +368,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal( Initialized_resource = rmm::mr::make_owning_wrapper( std::make_shared(), pool_size, pool_limit); } + } else if (use_cuda_async_alloc) { + auto const pool_limit = max_pool_size > 0 ? static_cast(max_pool_size) : + std::numeric_limits::max(); + auto const release_threshold = max_pool_size > 0 ? + thrust::optional{max_pool_size} : + thrust::optional{}; + // Use `limiting_resource_adaptor` to set a hard limit on the max pool size since + // `cuda_async_memory_resource` only has a release threshold. + Initialized_resource = rmm::mr::make_owning_wrapper( + std::make_shared(pool_size, release_threshold), + pool_limit); } else if (use_managed_mem) { Initialized_resource = std::make_shared(); } else { diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java index d639c4849d3..d1273b4b7c1 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,6 +32,7 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; +import static org.junit.jupiter.api.Assumptions.assumeFalse; public class RmmTest { private static final long TOO_MUCH_MEMORY = 3L * 1024 * 1024 * 1024 * 1024 * 1024 * 1024; @@ -51,18 +52,24 @@ public void teardown() { } @ParameterizedTest - @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL}) + @ValueSource(ints = { + RmmAllocationMode.CUDA_DEFAULT, + RmmAllocationMode.POOL, + RmmAllocationMode.ARENA}) public void testTotalAllocated(int rmmAllocMode) { Rmm.initialize(rmmAllocMode, false, 512 * 1024 * 1024); assertEquals(0, Rmm.getTotalBytesAllocated()); - try (DeviceMemoryBuffer addr = Rmm.alloc(1024)) { + try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) { assertEquals(1024, Rmm.getTotalBytesAllocated()); } assertEquals(0, Rmm.getTotalBytesAllocated()); } @ParameterizedTest - @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL}) + @ValueSource(ints = { + RmmAllocationMode.CUDA_DEFAULT, + RmmAllocationMode.POOL, + RmmAllocationMode.ARENA}) public void testEventHandler(int rmmAllocMode) { AtomicInteger invokedCount = new AtomicInteger(); AtomicLong amountRequested = new AtomicLong(); @@ -328,7 +335,7 @@ public void onDeallocThreshold(long totalAllocSize) { Rmm.setEventHandler(handler); DeviceMemoryBuffer addr = Rmm.alloc(6 * 1024); - assertThrows(DeallocThresholdException.class, () -> addr.close()); + assertThrows(DeallocThresholdException.class, addr::close); assertThrows(AllocThresholdException.class, () -> Rmm.alloc(12 * 1024)); assertThrows(AllocFailException.class, () -> Rmm.alloc(TOO_MUCH_MEMORY)); } @@ -356,7 +363,10 @@ public void testThreadAutoDeviceSetup() throws Exception { } @ParameterizedTest - @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL}) + @ValueSource(ints = { + RmmAllocationMode.CUDA_DEFAULT, + RmmAllocationMode.POOL, + RmmAllocationMode.ARENA}) public void testSetDeviceThrowsAfterRmmInit(int rmmAllocMode) { Rmm.initialize(rmmAllocMode, false, 1024 * 1024); assertThrows(CudfException.class, () -> Cuda.setDevice(Cuda.getDevice() + 1)); @@ -399,9 +409,41 @@ public void testPoolLimitNonPoolMode() { () -> Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 1024, 2048)); } - private static class AllocFailException extends RuntimeException {} - private static class AllocThresholdException extends RuntimeException {} - private static class DeallocThresholdException extends RuntimeException {} + @Test + public void testCudaAsyncMemoryResourceLimit() { + try { + Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024, 2048); + } catch (CudfException e) { + // CUDA 11.2 introduced cudaMallocAsync, older CUDA Toolkit will skip this test. + assumeFalse(e.getMessage().contains("cudaMallocAsync not supported")); + throw e; + } + try (DeviceMemoryBuffer ignored1 = Rmm.alloc(512); + DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) { + assertThrows(OutOfMemoryError.class, + () -> { + DeviceMemoryBuffer ignored3 = Rmm.alloc(1024); + ignored3.close(); + }); + } + } + + @Test + public void testCudaAsyncIsIncompatibleWithManaged() { + assertThrows(IllegalArgumentException.class, + () -> Rmm.initialize( + RmmAllocationMode.CUDA_ASYNC | RmmAllocationMode.CUDA_MANAGED_MEMORY, + false, 1024, 2048)); + } + + private static class AllocFailException extends RuntimeException { + } + + private static class AllocThresholdException extends RuntimeException { + } + + private static class DeallocThresholdException extends RuntimeException { + } private static abstract class BaseRmmEventHandler implements RmmEventHandler { @Override