From 053178419f3e8886cdb9336052f5abc8480f5aa6 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 8 Sep 2021 19:46:11 -0700 Subject: [PATCH 1/3] support cuda async memory resource in jni --- java/src/main/java/ai/rapids/cudf/Rmm.java | 31 ++++++++++++------- .../ai/rapids/cudf/RmmAllocationMode.java | 4 +++ java/src/main/native/src/RmmJni.cpp | 7 +++++ 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 97813182deb..0351b66bc16 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -85,7 +85,8 @@ public static LogConf logToStderr() { * @param allocationMode Allocation strategy to use. Bit set using * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#ARENA}, + * {@link RmmAllocationMode#CUDA_ASYNC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param enableLogging Enable logging memory manager events * @param poolSize The initial pool size in bytes @@ -106,7 +107,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po * @param allocationMode Allocation strategy to use. Bit set using * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#ARENA}, + * {@link RmmAllocationMode#CUDA_ASYNC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param enableLogging Enable logging memory manager events * @param poolSize The initial pool size in bytes @@ -138,7 +140,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po * @param allocationMode Allocation strategy to use. Bit set using * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#ARENA}, + * {@link RmmAllocationMode#CUDA_ASYNC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -159,7 +162,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, * @param allocationMode Allocation strategy to use. Bit set using * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#ARENA}, + * {@link RmmAllocationMode#CUDA_ASYNC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -168,8 +172,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, * @throws IllegalStateException if RMM has already been initialized * @throws IllegalArgumentException if a max pool size is specified but the allocation mode * is not {@link RmmAllocationMode#POOL} or - * {@link RmmAllocationMode#ARENA}, or the maximum pool size is - * below the initial size. + * {@link RmmAllocationMode#ARENA} or + * {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool + * size is below the initial size. */ public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize, long maxPoolSize) throws RmmException { @@ -186,7 +191,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, * @param allocationMode Allocation strategy to use. Bit set using * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#ARENA}, + * {@link RmmAllocationMode#CUDA_ASYNC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -198,8 +204,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, * @throws IllegalStateException if RMM has already been initialized * @throws IllegalArgumentException if a max pool size is specified but the allocation mode * is not {@link RmmAllocationMode#POOL} or - * {@link RmmAllocationMode#ARENA}, or the maximum pool size is - * below the initial size. + * {@link RmmAllocationMode#ARENA} or + * {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool + * size is below the initial size. */ public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize, long maxPoolSize, long allocationAlignment, long alignmentThreshold) throws RmmException { @@ -207,9 +214,11 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, throw new IllegalStateException("RMM is already initialized"); } if (maxPoolSize > 0) { - if (allocationMode != RmmAllocationMode.POOL && allocationMode != RmmAllocationMode.ARENA) { + if (allocationMode != RmmAllocationMode.POOL && + allocationMode != RmmAllocationMode.ARENA && + allocationMode != RmmAllocationMode.CUDA_ASYNC) { throw new IllegalArgumentException( - "Pool limit only supported in POOL or ARENA allocation mode"); + "Pool limit only supported in POOL or ARENA or CUDA_ASYNC allocation mode"); } if (maxPoolSize < poolSize) { throw new IllegalArgumentException("Pool limit of " + maxPoolSize diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 89230a06dd7..9a48f630a2e 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -32,4 +32,8 @@ public class RmmAllocationMode { * Use arena suballocation strategy */ public static final int ARENA = 0x00000004; + /** + * Use CUDA async suballocation strategy + */ + public static final int CUDA_ASYNC = 0x00000008; } diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 0105f8c43ca..70c1e1a6d1e 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -344,6 +345,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal( bool use_pool_alloc = allocation_mode & 1; bool use_managed_mem = allocation_mode & 2; bool use_arena_alloc = allocation_mode & 4; + bool use_cuda_async_alloc = allocation_mode & 8; if (use_pool_alloc) { auto pool_limit = (max_pool_size > 0) ? thrust::optional{static_cast(max_pool_size)} : @@ -365,6 +367,11 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal( Initialized_resource = rmm::mr::make_owning_wrapper( std::make_shared(), pool_size, pool_limit); } + } else if (use_cuda_async_alloc) { + auto pool_limit = (max_pool_size > 0) ? thrust::optional{max_pool_size} : + thrust::optional{}; + Initialized_resource = std::make_shared( + thrust::optional{pool_size}, pool_limit); } else if (use_managed_mem) { Initialized_resource = std::make_shared(); } else { From 75ac47b8ab58d1ba2d38767dddfd4c2035287c2c Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 9 Sep 2021 13:39:57 -0700 Subject: [PATCH 2/3] review feedback --- java/src/main/java/ai/rapids/cudf/Rmm.java | 6 +-- .../ai/rapids/cudf/RmmAllocationMode.java | 2 +- java/src/main/native/src/RmmJni.cpp | 4 +- .../src/test/java/ai/rapids/cudf/RmmTest.java | 47 +++++++++++++++---- 4 files changed, 44 insertions(+), 15 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 0351b66bc16..1633afe9786 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -214,9 +214,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, throw new IllegalStateException("RMM is already initialized"); } if (maxPoolSize > 0) { - if (allocationMode != RmmAllocationMode.POOL && - allocationMode != RmmAllocationMode.ARENA && - allocationMode != RmmAllocationMode.CUDA_ASYNC) { + if ((allocationMode & RmmAllocationMode.POOL) == 0 && + (allocationMode & RmmAllocationMode.ARENA) == 0 && + (allocationMode & RmmAllocationMode.CUDA_ASYNC) == 0) { throw new IllegalArgumentException( "Pool limit only supported in POOL or ARENA or CUDA_ASYNC allocation mode"); } diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 9a48f630a2e..966c21bee22 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 70c1e1a6d1e..b9d99dc5d42 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -370,8 +370,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal( } else if (use_cuda_async_alloc) { auto pool_limit = (max_pool_size > 0) ? thrust::optional{max_pool_size} : thrust::optional{}; - Initialized_resource = std::make_shared( - thrust::optional{pool_size}, pool_limit); + Initialized_resource = + std::make_shared(pool_size, pool_limit); } else if (use_managed_mem) { Initialized_resource = std::make_shared(); } else { diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java index d639c4849d3..f2b1f267e78 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,6 +32,7 @@ import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; +import static org.junit.jupiter.api.Assumptions.assumeFalse; public class RmmTest { private static final long TOO_MUCH_MEMORY = 3L * 1024 * 1024 * 1024 * 1024 * 1024 * 1024; @@ -51,18 +52,24 @@ public void teardown() { } @ParameterizedTest - @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL}) + @ValueSource(ints = { + RmmAllocationMode.CUDA_DEFAULT, + RmmAllocationMode.POOL, + RmmAllocationMode.ARENA}) public void testTotalAllocated(int rmmAllocMode) { Rmm.initialize(rmmAllocMode, false, 512 * 1024 * 1024); assertEquals(0, Rmm.getTotalBytesAllocated()); - try (DeviceMemoryBuffer addr = Rmm.alloc(1024)) { + try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) { assertEquals(1024, Rmm.getTotalBytesAllocated()); } assertEquals(0, Rmm.getTotalBytesAllocated()); } @ParameterizedTest - @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL}) + @ValueSource(ints = { + RmmAllocationMode.CUDA_DEFAULT, + RmmAllocationMode.POOL, + RmmAllocationMode.ARENA}) public void testEventHandler(int rmmAllocMode) { AtomicInteger invokedCount = new AtomicInteger(); AtomicLong amountRequested = new AtomicLong(); @@ -328,7 +335,7 @@ public void onDeallocThreshold(long totalAllocSize) { Rmm.setEventHandler(handler); DeviceMemoryBuffer addr = Rmm.alloc(6 * 1024); - assertThrows(DeallocThresholdException.class, () -> addr.close()); + assertThrows(DeallocThresholdException.class, addr::close); assertThrows(AllocThresholdException.class, () -> Rmm.alloc(12 * 1024)); assertThrows(AllocFailException.class, () -> Rmm.alloc(TOO_MUCH_MEMORY)); } @@ -356,7 +363,10 @@ public void testThreadAutoDeviceSetup() throws Exception { } @ParameterizedTest - @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL}) + @ValueSource(ints = { + RmmAllocationMode.CUDA_DEFAULT, + RmmAllocationMode.POOL, + RmmAllocationMode.ARENA}) public void testSetDeviceThrowsAfterRmmInit(int rmmAllocMode) { Rmm.initialize(rmmAllocMode, false, 1024 * 1024); assertThrows(CudfException.class, () -> Cuda.setDevice(Cuda.getDevice() + 1)); @@ -399,9 +409,28 @@ public void testPoolLimitNonPoolMode() { () -> Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 1024, 2048)); } - private static class AllocFailException extends RuntimeException {} - private static class AllocThresholdException extends RuntimeException {} - private static class DeallocThresholdException extends RuntimeException {} + @Test + public void testCudaAsyncMemoryResource() { + try { + Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024 * 1024L, 1024 * 1024L); + } catch (CudfException e) { + assumeFalse(e.getMessage().contains("cudaMallocAsync not supported")); + throw e; + } + DeviceMemoryBuffer buff = Rmm.alloc(1024); + buff.close(); + buff = Rmm.alloc(2048); + buff.close(); + } + + private static class AllocFailException extends RuntimeException { + } + + private static class AllocThresholdException extends RuntimeException { + } + + private static class DeallocThresholdException extends RuntimeException { + } private static abstract class BaseRmmEventHandler implements RmmEventHandler { @Override From 989a43b4068b5af1336b08bdece2645a02f65c93 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 9 Sep 2021 17:17:34 -0700 Subject: [PATCH 3/3] add hard limit --- java/src/main/java/ai/rapids/cudf/Rmm.java | 16 +++++++++--- java/src/main/native/src/RmmJni.cpp | 15 ++++++++--- .../src/test/java/ai/rapids/cudf/RmmTest.java | 25 ++++++++++++++----- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 1633afe9786..70d5bcbb501 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -213,18 +213,26 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, if (initialized) { throw new IllegalStateException("RMM is already initialized"); } + + boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0; + boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0; + boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0; + boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0; + if (maxPoolSize > 0) { - if ((allocationMode & RmmAllocationMode.POOL) == 0 && - (allocationMode & RmmAllocationMode.ARENA) == 0 && - (allocationMode & RmmAllocationMode.CUDA_ASYNC) == 0) { + if (!isPool && !isArena && !isAsync) { throw new IllegalArgumentException( - "Pool limit only supported in POOL or ARENA or CUDA_ASYNC allocation mode"); + "Pool limit only supported in POOL, ARENA, or CUDA_ASYNC allocation mode"); } if (maxPoolSize < poolSize) { throw new IllegalArgumentException("Pool limit of " + maxPoolSize + " is less than initial pool size of " + poolSize); } } + if (isAsync && isManaged) { + throw new IllegalArgumentException( + "CUDA Unified Memory is not supported in CUDA_ASYNC allocation mode"); + } LogLoc loc = LogLoc.NONE; String path = null; if (logConf != null) { diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index b9d99dc5d42..59469dd415c 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -368,10 +369,16 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal( std::make_shared(), pool_size, pool_limit); } } else if (use_cuda_async_alloc) { - auto pool_limit = (max_pool_size > 0) ? thrust::optional{max_pool_size} : - thrust::optional{}; - Initialized_resource = - std::make_shared(pool_size, pool_limit); + auto const pool_limit = max_pool_size > 0 ? static_cast(max_pool_size) : + std::numeric_limits::max(); + auto const release_threshold = max_pool_size > 0 ? + thrust::optional{max_pool_size} : + thrust::optional{}; + // Use `limiting_resource_adaptor` to set a hard limit on the max pool size since + // `cuda_async_memory_resource` only has a release threshold. + Initialized_resource = rmm::mr::make_owning_wrapper( + std::make_shared(pool_size, release_threshold), + pool_limit); } else if (use_managed_mem) { Initialized_resource = std::make_shared(); } else { diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java index f2b1f267e78..d1273b4b7c1 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -410,17 +410,30 @@ public void testPoolLimitNonPoolMode() { } @Test - public void testCudaAsyncMemoryResource() { + public void testCudaAsyncMemoryResourceLimit() { try { - Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024 * 1024L, 1024 * 1024L); + Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024, 2048); } catch (CudfException e) { + // CUDA 11.2 introduced cudaMallocAsync, older CUDA Toolkit will skip this test. assumeFalse(e.getMessage().contains("cudaMallocAsync not supported")); throw e; } - DeviceMemoryBuffer buff = Rmm.alloc(1024); - buff.close(); - buff = Rmm.alloc(2048); - buff.close(); + try (DeviceMemoryBuffer ignored1 = Rmm.alloc(512); + DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) { + assertThrows(OutOfMemoryError.class, + () -> { + DeviceMemoryBuffer ignored3 = Rmm.alloc(1024); + ignored3.close(); + }); + } + } + + @Test + public void testCudaAsyncIsIncompatibleWithManaged() { + assertThrows(IllegalArgumentException.class, + () -> Rmm.initialize( + RmmAllocationMode.CUDA_ASYNC | RmmAllocationMode.CUDA_MANAGED_MEMORY, + false, 1024, 2048)); } private static class AllocFailException extends RuntimeException {