From 053178419f3e8886cdb9336052f5abc8480f5aa6 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Wed, 8 Sep 2021 19:46:11 -0700
Subject: [PATCH 1/3] support cuda async memory resource in jni

---
 java/src/main/java/ai/rapids/cudf/Rmm.java    | 31 ++++++++++++-------
 .../ai/rapids/cudf/RmmAllocationMode.java     |  4 +++
 java/src/main/native/src/RmmJni.cpp           |  7 +++++
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 97813182deb..0351b66bc16 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -85,7 +85,8 @@ public static LogConf logToStderr() {
    * @param allocationMode Allocation strategy to use. Bit set using
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
-   *                       {@link RmmAllocationMode#ARENA} and
+   *                       {@link RmmAllocationMode#ARENA},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param enableLogging  Enable logging memory manager events
    * @param poolSize       The initial pool size in bytes
@@ -106,7 +107,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po
    * @param allocationMode Allocation strategy to use. Bit set using
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
-   *                       {@link RmmAllocationMode#ARENA} and
+   *                       {@link RmmAllocationMode#ARENA},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param enableLogging  Enable logging memory manager events
    * @param poolSize       The initial pool size in bytes
@@ -138,7 +140,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po
    * @param allocationMode Allocation strategy to use. Bit set using
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
-   *                       {@link RmmAllocationMode#ARENA} and
+   *                       {@link RmmAllocationMode#ARENA},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param logConf        How to do logging or null if you don't want to
    * @param poolSize       The initial pool size in bytes
@@ -159,7 +162,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
    * @param allocationMode Allocation strategy to use. Bit set using
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
-   *                       {@link RmmAllocationMode#ARENA} and
+   *                       {@link RmmAllocationMode#ARENA},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param logConf        How to do logging or null if you don't want to
    * @param poolSize       The initial pool size in bytes
@@ -168,8 +172,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
    * @throws IllegalStateException if RMM has already been initialized
    * @throws IllegalArgumentException if a max pool size is specified but the allocation mode
    *                                  is not {@link RmmAllocationMode#POOL} or
-   *                                  {@link RmmAllocationMode#ARENA}, or the maximum pool size is
-   *                                  below the initial size.
+   *                                  {@link RmmAllocationMode#ARENA} or
+   *                                  {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool
+   *                                  size is below the initial size.
    */
   public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
       long maxPoolSize) throws RmmException {
@@ -186,7 +191,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
    * @param allocationMode Allocation strategy to use. Bit set using
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
-   *                       {@link RmmAllocationMode#ARENA} and
+   *                       {@link RmmAllocationMode#ARENA},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param logConf        How to do logging or null if you don't want to
    * @param poolSize       The initial pool size in bytes
@@ -198,8 +204,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
    * @throws IllegalStateException if RMM has already been initialized
    * @throws IllegalArgumentException if a max pool size is specified but the allocation mode
    *                                  is not {@link RmmAllocationMode#POOL} or
-   *                                  {@link RmmAllocationMode#ARENA}, or the maximum pool size is
-   *                                  below the initial size.
+   *                                  {@link RmmAllocationMode#ARENA} or
+   *                                  {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool
+   *                                  size is below the initial size.
    */
   public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
       long maxPoolSize, long allocationAlignment, long alignmentThreshold) throws RmmException {
@@ -207,9 +214,11 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
       throw new IllegalStateException("RMM is already initialized");
     }
     if (maxPoolSize > 0) {
-      if (allocationMode != RmmAllocationMode.POOL && allocationMode != RmmAllocationMode.ARENA) {
+      if (allocationMode != RmmAllocationMode.POOL &&
+          allocationMode != RmmAllocationMode.ARENA &&
+          allocationMode != RmmAllocationMode.CUDA_ASYNC) {
         throw new IllegalArgumentException(
-                "Pool limit only supported in POOL or ARENA allocation mode");
+            "Pool limit only supported in POOL or ARENA or CUDA_ASYNC allocation mode");
       }
       if (maxPoolSize < poolSize) {
         throw new IllegalArgumentException("Pool limit of " + maxPoolSize
diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
index 89230a06dd7..9a48f630a2e 100644
--- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
+++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
@@ -32,4 +32,8 @@ public class RmmAllocationMode {
    * Use arena suballocation strategy
    */
   public static final int ARENA = 0x00000004;
+  /**
+   * Use CUDA async suballocation strategy
+   */
+  public static final int CUDA_ASYNC = 0x00000008;
 }
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 0105f8c43ca..70c1e1a6d1e 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -22,6 +22,7 @@
 
 #include <rmm/mr/device/aligned_resource_adaptor.hpp>
 #include <rmm/mr/device/arena_memory_resource.hpp>
+#include <rmm/mr/device/cuda_async_memory_resource.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/logging_resource_adaptor.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
@@ -344,6 +345,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(
     bool use_pool_alloc = allocation_mode & 1;
     bool use_managed_mem = allocation_mode & 2;
     bool use_arena_alloc = allocation_mode & 4;
+    bool use_cuda_async_alloc = allocation_mode & 8;
     if (use_pool_alloc) {
       auto pool_limit = (max_pool_size > 0) ?
                             thrust::optional<std::size_t>{static_cast<std::size_t>(max_pool_size)} :
@@ -365,6 +367,11 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(
         Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
             std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_limit);
       }
+    } else if (use_cuda_async_alloc) {
+      auto pool_limit = (max_pool_size > 0) ? thrust::optional<std::size_t>{max_pool_size} :
+                                              thrust::optional<std::size_t>{};
+      Initialized_resource = std::make_shared<rmm::mr::cuda_async_memory_resource>(
+            thrust::optional<std::size_t>{pool_size}, pool_limit);
     } else if (use_managed_mem) {
       Initialized_resource = std::make_shared<rmm::mr::managed_memory_resource>();
     } else {

From 75ac47b8ab58d1ba2d38767dddfd4c2035287c2c Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Thu, 9 Sep 2021 13:39:57 -0700
Subject: [PATCH 2/3] review feedback

---
 java/src/main/java/ai/rapids/cudf/Rmm.java    |  6 +--
 .../ai/rapids/cudf/RmmAllocationMode.java     |  2 +-
 java/src/main/native/src/RmmJni.cpp           |  4 +-
 .../src/test/java/ai/rapids/cudf/RmmTest.java | 47 +++++++++++++++----
 4 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 0351b66bc16..1633afe9786 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -214,9 +214,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
       throw new IllegalStateException("RMM is already initialized");
     }
     if (maxPoolSize > 0) {
-      if (allocationMode != RmmAllocationMode.POOL &&
-          allocationMode != RmmAllocationMode.ARENA &&
-          allocationMode != RmmAllocationMode.CUDA_ASYNC) {
+      if ((allocationMode & RmmAllocationMode.POOL) == 0 &&
+          (allocationMode & RmmAllocationMode.ARENA) == 0 &&
+          (allocationMode & RmmAllocationMode.CUDA_ASYNC) == 0) {
         throw new IllegalArgumentException(
             "Pool limit only supported in POOL or ARENA or CUDA_ASYNC allocation mode");
       }
diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
index 9a48f630a2e..966c21bee22 100644
--- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
+++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 70c1e1a6d1e..b9d99dc5d42 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -370,8 +370,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(
     } else if (use_cuda_async_alloc) {
       auto pool_limit = (max_pool_size > 0) ? thrust::optional<std::size_t>{max_pool_size} :
                                               thrust::optional<std::size_t>{};
-      Initialized_resource = std::make_shared<rmm::mr::cuda_async_memory_resource>(
-            thrust::optional<std::size_t>{pool_size}, pool_limit);
+      Initialized_resource =
+          std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size, pool_limit);
     } else if (use_managed_mem) {
       Initialized_resource = std::make_shared<rmm::mr::managed_memory_resource>();
     } else {
diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java
index d639c4849d3..f2b1f267e78 100644
--- a/java/src/test/java/ai/rapids/cudf/RmmTest.java
+++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
+import static org.junit.jupiter.api.Assumptions.assumeFalse;
 
 public class RmmTest {
   private static final long TOO_MUCH_MEMORY = 3L * 1024 * 1024 * 1024 * 1024 * 1024 * 1024;
@@ -51,18 +52,24 @@ public void teardown() {
   }
 
   @ParameterizedTest
-  @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
   public void testTotalAllocated(int rmmAllocMode) {
     Rmm.initialize(rmmAllocMode, false, 512 * 1024 * 1024);
     assertEquals(0, Rmm.getTotalBytesAllocated());
-    try (DeviceMemoryBuffer addr = Rmm.alloc(1024)) {
+    try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
       assertEquals(1024, Rmm.getTotalBytesAllocated());
     }
     assertEquals(0, Rmm.getTotalBytesAllocated());
   }
 
   @ParameterizedTest
-  @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
   public void testEventHandler(int rmmAllocMode) {
     AtomicInteger invokedCount = new AtomicInteger();
     AtomicLong amountRequested = new AtomicLong();
@@ -328,7 +335,7 @@ public void onDeallocThreshold(long totalAllocSize) {
 
     Rmm.setEventHandler(handler);
     DeviceMemoryBuffer addr = Rmm.alloc(6 * 1024);
-    assertThrows(DeallocThresholdException.class, () -> addr.close());
+    assertThrows(DeallocThresholdException.class, addr::close);
     assertThrows(AllocThresholdException.class, () -> Rmm.alloc(12 * 1024));
     assertThrows(AllocFailException.class, () -> Rmm.alloc(TOO_MUCH_MEMORY));
   }
@@ -356,7 +363,10 @@ public void testThreadAutoDeviceSetup() throws Exception {
   }
 
   @ParameterizedTest
-  @ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
+  @ValueSource(ints = {
+      RmmAllocationMode.CUDA_DEFAULT,
+      RmmAllocationMode.POOL,
+      RmmAllocationMode.ARENA})
   public void testSetDeviceThrowsAfterRmmInit(int rmmAllocMode) {
     Rmm.initialize(rmmAllocMode, false, 1024 * 1024);
     assertThrows(CudfException.class, () -> Cuda.setDevice(Cuda.getDevice() + 1));
@@ -399,9 +409,28 @@ public void testPoolLimitNonPoolMode() {
         () -> Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 1024, 2048));
   }
 
-  private static class AllocFailException extends RuntimeException {}
-  private static class AllocThresholdException extends RuntimeException {}
-  private static class DeallocThresholdException extends RuntimeException {}
+  @Test
+  public void testCudaAsyncMemoryResource() {
+    try {
+      Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024 * 1024L, 1024 * 1024L);
+    } catch (CudfException e) {
+      assumeFalse(e.getMessage().contains("cudaMallocAsync not supported"));
+      throw e;
+    }
+    DeviceMemoryBuffer buff = Rmm.alloc(1024);
+    buff.close();
+    buff = Rmm.alloc(2048);
+    buff.close();
+  }
+
+  private static class AllocFailException extends RuntimeException {
+  }
+
+  private static class AllocThresholdException extends RuntimeException {
+  }
+
+  private static class DeallocThresholdException extends RuntimeException {
+  }
 
   private static abstract class BaseRmmEventHandler implements RmmEventHandler {
     @Override

From 989a43b4068b5af1336b08bdece2645a02f65c93 Mon Sep 17 00:00:00 2001
From: Rong Ou <rong.ou@gmail.com>
Date: Thu, 9 Sep 2021 17:17:34 -0700
Subject: [PATCH 3/3] add hard limit

---
 java/src/main/java/ai/rapids/cudf/Rmm.java    | 16 +++++++++---
 java/src/main/native/src/RmmJni.cpp           | 15 ++++++++---
 .../src/test/java/ai/rapids/cudf/RmmTest.java | 25 ++++++++++++++-----
 3 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 1633afe9786..70d5bcbb501 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -213,18 +213,26 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
     if (initialized) {
       throw new IllegalStateException("RMM is already initialized");
     }
+
+    boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0;
+    boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0;
+    boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0;
+    boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0;
+
     if (maxPoolSize > 0) {
-      if ((allocationMode & RmmAllocationMode.POOL) == 0 &&
-          (allocationMode & RmmAllocationMode.ARENA) == 0 &&
-          (allocationMode & RmmAllocationMode.CUDA_ASYNC) == 0) {
+      if (!isPool && !isArena && !isAsync) {
         throw new IllegalArgumentException(
-            "Pool limit only supported in POOL or ARENA or CUDA_ASYNC allocation mode");
+            "Pool limit only supported in POOL, ARENA, or CUDA_ASYNC allocation mode");
       }
       if (maxPoolSize < poolSize) {
         throw new IllegalArgumentException("Pool limit of " + maxPoolSize
             + " is less than initial pool size of " + poolSize);
       }
     }
+    if (isAsync && isManaged) {
+      throw new IllegalArgumentException(
+          "CUDA Unified Memory is not supported in CUDA_ASYNC allocation mode");
+    }
     LogLoc loc = LogLoc.NONE;
     String path = null;
     if (logConf != null) {
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index b9d99dc5d42..59469dd415c 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -24,6 +24,7 @@
 #include <rmm/mr/device/arena_memory_resource.hpp>
 #include <rmm/mr/device/cuda_async_memory_resource.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/limiting_resource_adaptor.hpp>
 #include <rmm/mr/device/logging_resource_adaptor.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/owning_wrapper.hpp>
@@ -368,10 +369,16 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(
             std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_limit);
       }
     } else if (use_cuda_async_alloc) {
-      auto pool_limit = (max_pool_size > 0) ? thrust::optional<std::size_t>{max_pool_size} :
-                                              thrust::optional<std::size_t>{};
-      Initialized_resource =
-          std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size, pool_limit);
+      auto const pool_limit = max_pool_size > 0 ? static_cast<std::size_t>(max_pool_size) :
+                                                  std::numeric_limits<std::size_t>::max();
+      auto const release_threshold = max_pool_size > 0 ?
+                                         thrust::optional<std::size_t>{max_pool_size} :
+                                         thrust::optional<std::size_t>{};
+      // Use `limiting_resource_adaptor` to set a hard limit on the max pool size since
+      // `cuda_async_memory_resource` only has a release threshold.
+      Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::limiting_resource_adaptor>(
+          std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size, release_threshold),
+          pool_limit);
     } else if (use_managed_mem) {
       Initialized_resource = std::make_shared<rmm::mr::managed_memory_resource>();
     } else {
diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java
index f2b1f267e78..d1273b4b7c1 100644
--- a/java/src/test/java/ai/rapids/cudf/RmmTest.java
+++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java
@@ -410,17 +410,30 @@ public void testPoolLimitNonPoolMode() {
   }
 
   @Test
-  public void testCudaAsyncMemoryResource() {
+  public void testCudaAsyncMemoryResourceLimit() {
     try {
-      Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024 * 1024L, 1024 * 1024L);
+      Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024, 2048);
     } catch (CudfException e) {
+      // CUDA 11.2 introduced cudaMallocAsync, older CUDA Toolkit will skip this test.
       assumeFalse(e.getMessage().contains("cudaMallocAsync not supported"));
       throw e;
     }
-    DeviceMemoryBuffer buff = Rmm.alloc(1024);
-    buff.close();
-    buff = Rmm.alloc(2048);
-    buff.close();
+    try (DeviceMemoryBuffer ignored1 = Rmm.alloc(512);
+         DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) {
+      assertThrows(OutOfMemoryError.class,
+          () -> {
+            DeviceMemoryBuffer ignored3 = Rmm.alloc(1024);
+            ignored3.close();
+          });
+    }
+  }
+
+  @Test
+  public void testCudaAsyncIsIncompatibleWithManaged() {
+    assertThrows(IllegalArgumentException.class,
+        () -> Rmm.initialize(
+            RmmAllocationMode.CUDA_ASYNC | RmmAllocationMode.CUDA_MANAGED_MEMORY,
+            false, 1024, 2048));
   }
 
   private static class AllocFailException extends RuntimeException {