From 68189b173b90d9c1926f4b81baaebda5d3a46fe0 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 3 Oct 2024 09:30:47 -0700 Subject: [PATCH 1/4] [JNI] Enables fabric handles for CUDA async memory pools Signed-off-by: Alessandro Bellina --- java/src/main/java/ai/rapids/cudf/Rmm.java | 11 +++++++---- .../java/ai/rapids/cudf/RmmAllocationMode.java | 5 +++++ .../cudf/RmmCudaAsyncMemoryResource.java | 13 ++++++++++++- java/src/main/native/src/RmmJni.cpp | 18 ++++++++++++++++-- 4 files changed, 40 insertions(+), 7 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index ed029c918e4..d1cc0cc96fe 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -206,7 +206,8 @@ private static void setGlobalValsFromResource(RmmDeviceMemoryResource resource) * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, * {@link RmmAllocationMode#ARENA}, - * {@link RmmAllocationMode#CUDA_ASYNC} and + * {@link RmmAllocationMode#CUDA_ASYNC}, + * {@link RmmAllocationMode#CUDA_ASYNC_FABRIC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -221,6 +222,7 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0; boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0; boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0; + boolean isAsyncFabric = (allocationMode & RmmAllocationMode.CUDA_ASYNC_FABRIC) != 0; boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0; if (isAsync && isManaged) { @@ -246,6 +248,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, } else if (isAsync) { resource = new RmmLimitingResourceAdaptor<>( new RmmCudaAsyncMemoryResource(poolSize, poolSize), poolSize, 512); + } else if (isAsyncFabric) { + resource = new RmmLimitingResourceAdaptor<>( + new RmmCudaAsyncMemoryResource(poolSize, poolSize, true), poolSize, 512); } else if (isManaged) { resource = new RmmManagedMemoryResource(); } else { @@ -521,7 +526,6 @@ public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) { private static native long allocInternal(long size, long stream) throws RmmException; - static native void free(long ptr, long length, long stream) throws RmmException; /** @@ -562,7 +566,7 @@ static native long newArenaMemoryResource(long childHandle, static native void releaseArenaMemoryResource(long handle); - static native long newCudaAsyncMemoryResource(long size, long release) throws RmmException; + static native long newCudaAsyncMemoryResource(long size, long release, boolean fabric) throws RmmException; static native void releaseCudaAsyncMemoryResource(long handle); @@ -575,7 +579,6 @@ static native long newLoggingResourceAdaptor(long handle, int type, String path, static native void releaseLoggingResourceAdaptor(long handle); - static native long newTrackingResourceAdaptor(long handle, long alignment) throws RmmException; static native void releaseTrackingResourceAdaptor(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 966c21bee22..8fd2b04742f 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -36,4 
+36,9 @@ public class RmmAllocationMode { * Use CUDA async suballocation strategy */ public static final int CUDA_ASYNC = 0x00000008; + /** + * Use CUDA async suballocation strategy with fabric handles that are + * peer accessible with read-write access + */ + public static final int CUDA_ASYNC_FABRIC = 0x00000010; } diff --git a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java index fa1f13cb7ed..ac46da39193 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java +++ b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java @@ -30,9 +30,20 @@ public class RmmCudaAsyncMemoryResource implements RmmDeviceMemoryResource { * @param releaseThreshold size in bytes for when memory is released back to cuda */ public RmmCudaAsyncMemoryResource(long size, long releaseThreshold) { + this(size, releaseThreshold, false); + } + + /** + * Create a new async memory resource + * @param size the initial size of the pool + * @param releaseThreshold size in bytes for when memory is released back to cuda + * @param fabric if true request peer read+write accessible fabric handles when + * creating the pool + */ + public RmmCudaAsyncMemoryResource(long size, long releaseThreshold, boolean fabric) { this.size = size; this.releaseThreshold = releaseThreshold; - handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold); + handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold, fabric); } @Override diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 23c7b7fb243..94c0b809082 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -775,11 +775,25 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv* env, jclass clazz, jlong init, - jlong release) + jlong release, + jboolean fabric) { try { cudf::jni::auto_set_device(env); - auto ret = new rmm::mr::cuda_async_memory_resource(init, release); + + // when we are using fabric, we need to set the memory access to be + // read_write, in order for peer GPUs to have access to this memory. + auto [handle_type, prot_flag] = !fabric ? + std::pair{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, + rmm::mr::cuda_async_memory_resource::access_flags::none} : + std::pair{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric, + rmm::mr::cuda_async_memory_resource::access_flags::read_write}; + + auto ret = new rmm::mr::cuda_async_memory_resource( + init, release, handle_type, prot_flag); + return reinterpret_cast(ret); } CATCH_STD(env, 0) From 3c17202f0e99bffb50e4babb2806184dfcc4061e Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 6 Dec 2024 16:56:34 +0000 Subject: [PATCH 2/4] Update copyrights and review comments Signed-off-by: Alessandro Bellina --- .../main/java/ai/rapids/cudf/RmmAllocationMode.java | 2 +- .../ai/rapids/cudf/RmmCudaAsyncMemoryResource.java | 2 +- java/src/main/native/src/RmmJni.cpp | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 8fd2b04742f..3f7bc1fae76 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java index ac46da39193..cf4936e2e24 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java +++ b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 94c0b809082..1d319946f78 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -783,13 +783,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEn // when we are using fabric, we need to set the memory access to be // read_write, in order for peer GPUs to have access to this memory. - auto [handle_type, prot_flag] = !fabric ? - std::pair{ - rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, - rmm::mr::cuda_async_memory_resource::access_flags::none} : + auto [handle_type, prot_flag] = fabric ? std::pair{ rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric, - rmm::mr::cuda_async_memory_resource::access_flags::read_write}; + rmm::mr::cuda_async_memory_resource::access_flags::read_write} : + std::pair{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, + rmm::mr::cuda_async_memory_resource::access_flags::none}; auto ret = new rmm::mr::cuda_async_memory_resource( init, release, handle_type, prot_flag); From 3f7e1bd5c5d4123ef6302c4015d20ba4bcc46237 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sun, 8 Dec 2024 13:50:03 -0800 Subject: [PATCH 3/4] clang format Signed-off-by: Alessandro Bellina --- java/src/main/native/src/RmmJni.cpp | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 1d319946f78..eac3377f1ab 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -772,27 +772,21 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv CATCH_STD(env, ) } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv* env, - jclass clazz, - jlong init, - jlong release, - jboolean fabric) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource( + JNIEnv* env, jclass clazz, jlong init, jlong release, jboolean fabric) { try { cudf::jni::auto_set_device(env); - - // when we are using fabric, we need to set the memory access to be + + // when we are using fabric, we need to set the memory access to be // read_write, in order for peer GPUs to have access to this memory. - auto [handle_type, prot_flag] = fabric ? 
- std::pair{ - rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric, - rmm::mr::cuda_async_memory_resource::access_flags::read_write} : - std::pair{ - rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, - rmm::mr::cuda_async_memory_resource::access_flags::none}; - - auto ret = new rmm::mr::cuda_async_memory_resource( - init, release, handle_type, prot_flag); + auto [handle_type, prot_flag] = + fabric ? std::pair{rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric, + rmm::mr::cuda_async_memory_resource::access_flags::read_write} + : std::pair{rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, + rmm::mr::cuda_async_memory_resource::access_flags::none}; + + auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type, prot_flag); return reinterpret_cast(ret); } From 8d27a927777436c2e2835bf3fce235a9acfaac80 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 9 Dec 2024 07:50:01 -0800 Subject: [PATCH 4/4] If not selecting fabric, pass nullopt to keep old CUDA api calls Signed-off-by: Alessandro Bellina --- java/src/main/native/src/RmmJni.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index eac3377f1ab..0f424761bfe 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -778,13 +778,15 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource( try { cudf::jni::auto_set_device(env); - // when we are using fabric, we need to set the memory access to be + // When we are using fabric, we need to set the memory access to be // read_write, in order for peer GPUs to have access to this memory. + // Otherwise, choose default parameters (optional set to nullopt). auto [handle_type, prot_flag] = - fabric ? std::pair{rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric, - rmm::mr::cuda_async_memory_resource::access_flags::read_write} - : std::pair{rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, - rmm::mr::cuda_async_memory_resource::access_flags::none}; + fabric + ? std::pair{std::optional{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric}, + std::optional{rmm::mr::cuda_async_memory_resource::access_flags::read_write}} + : std::pair{std::nullopt, std::nullopt}; auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type, prot_flag);
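
Illustrative usage (not part of the patch series): a minimal Java sketch of how the new CUDA_ASYNC_FABRIC allocation mode introduced above could be exercised from the cuDF Java API. The class name, pool size, and allocation size are hypothetical example values, and the snippet assumes a driver/GPU combination that supports fabric memory handles; internally the mode routes through the new RmmCudaAsyncMemoryResource(size, releaseThreshold, true) constructor, which the JNI layer maps to fabric allocation handles with read-write peer access.

    import ai.rapids.cudf.Cuda;
    import ai.rapids.cudf.DeviceMemoryBuffer;
    import ai.rapids.cudf.Rmm;
    import ai.rapids.cudf.RmmAllocationMode;

    public class FabricPoolExample {
      public static void main(String[] args) {
        // Example value: 1 GiB used for both the initial pool size and the release threshold.
        long poolSize = 1L << 30;
        // CUDA_ASYNC_FABRIC creates the async pool with fabric handles so peer GPUs
        // can be granted read-write access to memory allocated from it.
        Rmm.initialize(RmmAllocationMode.CUDA_ASYNC_FABRIC, null, poolSize);
        try (DeviceMemoryBuffer buf = Rmm.alloc(1024, Cuda.DEFAULT_STREAM)) {
          // buf is served from the fabric-enabled async pool.
        } finally {
          Rmm.shutdown();
        }
      }
    }

Keeping fabric opt-in (the boolean defaults to false, and the non-fabric path passes std::nullopt in the final patch) preserves the existing CUDA driver calls and behavior for all current users of CUDA_ASYNC.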