Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support CUDA async memory resource in JNI #9201

Merged
merged 5 commits into from
Sep 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 28 additions & 11 deletions java/src/main/java/ai/rapids/cudf/Rmm.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ public static LogConf logToStderr() {
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param enableLogging Enable logging memory manager events
* @param poolSize The initial pool size in bytes
Expand All @@ -106,7 +107,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param enableLogging Enable logging memory manager events
* @param poolSize The initial pool size in bytes
Expand Down Expand Up @@ -138,7 +140,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param logConf How to do logging or null if you don't want to
* @param poolSize The initial pool size in bytes
Expand All @@ -159,7 +162,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param logConf How to do logging or null if you don't want to
* @param poolSize The initial pool size in bytes
Expand All @@ -168,8 +172,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
* @throws IllegalStateException if RMM has already been initialized
* @throws IllegalArgumentException if a max pool size is specified but the allocation mode
* is not {@link RmmAllocationMode#POOL} or
* {@link RmmAllocationMode#ARENA}, or the maximum pool size is
* below the initial size.
* {@link RmmAllocationMode#ARENA} or
* {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool
* size is below the initial size.
*/
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
long maxPoolSize) throws RmmException {
Expand All @@ -186,7 +191,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param logConf How to do logging or null if you don't want to
* @param poolSize The initial pool size in bytes
Expand All @@ -198,24 +204,35 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
* @throws IllegalStateException if RMM has already been initialized
* @throws IllegalArgumentException if a max pool size is specified but the allocation mode
* is not {@link RmmAllocationMode#POOL} or
* {@link RmmAllocationMode#ARENA}, or the maximum pool size is
* below the initial size.
* {@link RmmAllocationMode#ARENA} or
* {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool
* size is below the initial size.
*/
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
long maxPoolSize, long allocationAlignment, long alignmentThreshold) throws RmmException {
if (initialized) {
throw new IllegalStateException("RMM is already initialized");
}

boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0;
boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0;
boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0;
boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0;

if (maxPoolSize > 0) {
if (allocationMode != RmmAllocationMode.POOL && allocationMode != RmmAllocationMode.ARENA) {
if (!isPool && !isArena && !isAsync) {
throw new IllegalArgumentException(
"Pool limit only supported in POOL or ARENA allocation mode");
"Pool limit only supported in POOL, ARENA, or CUDA_ASYNC allocation mode");
}
if (maxPoolSize < poolSize) {
throw new IllegalArgumentException("Pool limit of " + maxPoolSize
+ " is less than initial pool size of " + poolSize);
}
}
if (isAsync && isManaged) {
throw new IllegalArgumentException(
"CUDA Unified Memory is not supported in CUDA_ASYNC allocation mode");
}
LogLoc loc = LogLoc.NONE;
String path = null;
if (logConf != null) {
Expand Down
6 changes: 5 additions & 1 deletion java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,4 +32,8 @@ public class RmmAllocationMode {
* Use arena suballocation strategy
*/
public static final int ARENA = 0x00000004;
/**
 * Use the CUDA asynchronous pool allocator, i.e. rmm::mr::cuda_async_memory_resource
 * backed by cudaMallocAsync. Cannot be combined with {@link #CUDA_MANAGED_MEMORY}.
 */
public static final int CUDA_ASYNC = 0x00000008;
}
14 changes: 14 additions & 0 deletions java/src/main/native/src/RmmJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@

#include <rmm/mr/device/aligned_resource_adaptor.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
#include <rmm/mr/device/logging_resource_adaptor.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
Expand Down Expand Up @@ -344,6 +346,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(
bool use_pool_alloc = allocation_mode & 1;
bool use_managed_mem = allocation_mode & 2;
bool use_arena_alloc = allocation_mode & 4;
bool use_cuda_async_alloc = allocation_mode & 8;
if (use_pool_alloc) {
auto pool_limit = (max_pool_size > 0) ?
thrust::optional<std::size_t>{static_cast<std::size_t>(max_pool_size)} :
Expand All @@ -365,6 +368,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_limit);
}
} else if (use_cuda_async_alloc) {
auto const pool_limit = max_pool_size > 0 ? static_cast<std::size_t>(max_pool_size) :
std::numeric_limits<std::size_t>::max();
auto const release_threshold = max_pool_size > 0 ?
thrust::optional<std::size_t>{max_pool_size} :
thrust::optional<std::size_t>{};
// Use `limiting_resource_adaptor` to set a hard limit on the max pool size since
// `cuda_async_memory_resource` only has a release threshold.
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::limiting_resource_adaptor>(
std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size, release_threshold),
pool_limit);
} else if (use_managed_mem) {
Initialized_resource = std::make_shared<rmm::mr::managed_memory_resource>();
} else {
Expand Down
60 changes: 51 additions & 9 deletions java/src/test/java/ai/rapids/cudf/RmmTest.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,6 +32,7 @@
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import static org.junit.jupiter.api.Assumptions.assumeFalse;

public class RmmTest {
private static final long TOO_MUCH_MEMORY = 3L * 1024 * 1024 * 1024 * 1024 * 1024 * 1024;
Expand All @@ -51,18 +52,24 @@ public void teardown() {
}

@ParameterizedTest
@ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testTotalAllocated(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, false, 512 * 1024 * 1024);
assertEquals(0, Rmm.getTotalBytesAllocated());
try (DeviceMemoryBuffer addr = Rmm.alloc(1024)) {
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
assertEquals(1024, Rmm.getTotalBytesAllocated());
}
assertEquals(0, Rmm.getTotalBytesAllocated());
}

@ParameterizedTest
@ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testEventHandler(int rmmAllocMode) {
AtomicInteger invokedCount = new AtomicInteger();
AtomicLong amountRequested = new AtomicLong();
Expand Down Expand Up @@ -328,7 +335,7 @@ public void onDeallocThreshold(long totalAllocSize) {

Rmm.setEventHandler(handler);
DeviceMemoryBuffer addr = Rmm.alloc(6 * 1024);
assertThrows(DeallocThresholdException.class, () -> addr.close());
assertThrows(DeallocThresholdException.class, addr::close);
assertThrows(AllocThresholdException.class, () -> Rmm.alloc(12 * 1024));
assertThrows(AllocFailException.class, () -> Rmm.alloc(TOO_MUCH_MEMORY));
}
Expand Down Expand Up @@ -356,7 +363,10 @@ public void testThreadAutoDeviceSetup() throws Exception {
}

@ParameterizedTest
@ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testSetDeviceThrowsAfterRmmInit(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, false, 1024 * 1024);
assertThrows(CudfException.class, () -> Cuda.setDevice(Cuda.getDevice() + 1));
Expand Down Expand Up @@ -399,9 +409,41 @@ public void testPoolLimitNonPoolMode() {
() -> Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 1024, 2048));
}

private static class AllocFailException extends RuntimeException {}
private static class AllocThresholdException extends RuntimeException {}
private static class DeallocThresholdException extends RuntimeException {}
@Test
public void testCudaAsyncMemoryResourceLimit() {
  try {
    Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024, 2048);
  } catch (CudfException e) {
    // cudaMallocAsync first shipped with CUDA 11.2; on older toolkits the
    // initialization fails and the test is skipped rather than failed.
    assumeFalse(e.getMessage().contains("cudaMallocAsync not supported"));
    throw e;
  }
  // With 512 + 1024 bytes outstanding, one more 1024-byte allocation would
  // exceed the 2048-byte hard limit and must raise OutOfMemoryError.
  try (DeviceMemoryBuffer first = Rmm.alloc(512);
       DeviceMemoryBuffer second = Rmm.alloc(1024)) {
    assertThrows(OutOfMemoryError.class, () -> Rmm.alloc(1024).close());
  }
}

@Test
public void testCudaAsyncIsIncompatibleWithManaged() {
  // Combining the async pool with managed (unified) memory is rejected up
  // front by Rmm.initialize with an IllegalArgumentException.
  int mode = RmmAllocationMode.CUDA_ASYNC | RmmAllocationMode.CUDA_MANAGED_MEMORY;
  assertThrows(IllegalArgumentException.class,
      () -> Rmm.initialize(mode, false, 1024, 2048));
}

/** Marker exception raised by the test event handler when an allocation fails. */
private static class AllocFailException extends RuntimeException {}

/** Marker exception raised by the test event handler at the alloc threshold. */
private static class AllocThresholdException extends RuntimeException {}

/** Marker exception raised by the test event handler at the dealloc threshold. */
private static class DeallocThresholdException extends RuntimeException {}

private static abstract class BaseRmmEventHandler implements RmmEventHandler {
@Override
Expand Down