From 68189b173b90d9c1926f4b81baaebda5d3a46fe0 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 3 Oct 2024 09:30:47 -0700 Subject: [PATCH 1/4] [JNI] Enables fabric handles for CUDA async memory pools Signed-off-by: Alessandro Bellina --- java/src/main/java/ai/rapids/cudf/Rmm.java | 11 +++++++---- .../java/ai/rapids/cudf/RmmAllocationMode.java | 5 +++++ .../cudf/RmmCudaAsyncMemoryResource.java | 13 ++++++++++++- java/src/main/native/src/RmmJni.cpp | 18 ++++++++++++++++-- 4 files changed, 40 insertions(+), 7 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index ed029c918e4..d1cc0cc96fe 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -206,7 +206,8 @@ private static void setGlobalValsFromResource(RmmDeviceMemoryResource resource) * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, * {@link RmmAllocationMode#ARENA}, - * {@link RmmAllocationMode#CUDA_ASYNC} and + * {@link RmmAllocationMode#CUDA_ASYNC}, + * {@link RmmAllocationMode#CUDA_ASYNC_FABRIC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -221,6 +222,7 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0; boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0; boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0; + boolean isAsyncFabric = (allocationMode & RmmAllocationMode.CUDA_ASYNC_FABRIC) != 0; boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0; if (isAsync && isManaged) { @@ -246,6 +248,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, } else if (isAsync) { resource = new RmmLimitingResourceAdaptor<>( new RmmCudaAsyncMemoryResource(poolSize, poolSize), poolSize, 512); + } else if (isAsyncFabric) { + resource = new RmmLimitingResourceAdaptor<>( + new RmmCudaAsyncMemoryResource(poolSize, poolSize, true), poolSize, 512); } else if (isManaged) { resource = new RmmManagedMemoryResource(); } else { @@ -521,7 +526,6 @@ public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) { private static native long allocInternal(long size, long stream) throws RmmException; - static native void free(long ptr, long length, long stream) throws RmmException; /** @@ -562,7 +566,7 @@ static native long newArenaMemoryResource(long childHandle, static native void releaseArenaMemoryResource(long handle); - static native long newCudaAsyncMemoryResource(long size, long release) throws RmmException; + static native long newCudaAsyncMemoryResource(long size, long release, boolean fabric) throws RmmException; static native void releaseCudaAsyncMemoryResource(long handle); @@ -575,7 +579,6 @@ static native long newLoggingResourceAdaptor(long handle, int type, String path, static native void releaseLoggingResourceAdaptor(long handle); - static native long newTrackingResourceAdaptor(long handle, long alignment) throws RmmException; static native void releaseTrackingResourceAdaptor(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 966c21bee22..8fd2b04742f 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -36,4 
+36,9 @@ public class RmmAllocationMode { * Use CUDA async suballocation strategy */ public static final int CUDA_ASYNC = 0x00000008; + /** + * Use CUDA async suballocation strategy with fabric handles that are + * peer accessible with read-write access + */ + public static final int CUDA_ASYNC_FABRIC = 0x00000010; } diff --git a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java index fa1f13cb7ed..ac46da39193 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java +++ b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java @@ -30,9 +30,20 @@ public class RmmCudaAsyncMemoryResource implements RmmDeviceMemoryResource { * @param releaseThreshold size in bytes for when memory is released back to cuda */ public RmmCudaAsyncMemoryResource(long size, long releaseThreshold) { + this(size, releaseThreshold, false); + } + + /** + * Create a new async memory resource + * @param size the initial size of the pool + * @param releaseThreshold size in bytes for when memory is released back to cuda + * @param fabric if true request peer read+write accessible fabric handles when + * creating the pool + */ + public RmmCudaAsyncMemoryResource(long size, long releaseThreshold, boolean fabric) { this.size = size; this.releaseThreshold = releaseThreshold; - handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold); + handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold, fabric); } @Override diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 23c7b7fb243..94c0b809082 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -775,11 +775,25 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv* env, jclass clazz, jlong init, - jlong release) + jlong release, + jboolean fabric) { try { cudf::jni::auto_set_device(env); - auto ret = new rmm::mr::cuda_async_memory_resource(init, release); + + // when we are using fabric, we need to set the memory access to be + // read_write, in order for peer GPUs to have access to this memory. + auto [handle_type, prot_flag] = !fabric ? + std::pair{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, + rmm::mr::cuda_async_memory_resource::access_flags::none} : + std::pair{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric, + rmm::mr::cuda_async_memory_resource::access_flags::read_write}; + + auto ret = new rmm::mr::cuda_async_memory_resource( + init, release, handle_type, prot_flag); + return reinterpret_cast(ret); } CATCH_STD(env, 0) From 3c17202f0e99bffb50e4babb2806184dfcc4061e Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 6 Dec 2024 16:56:34 +0000 Subject: [PATCH 2/4] Update copyrights and review comments Signed-off-by: Alessandro Bellina --- .../main/java/ai/rapids/cudf/RmmAllocationMode.java | 2 +- .../ai/rapids/cudf/RmmCudaAsyncMemoryResource.java | 2 +- java/src/main/native/src/RmmJni.cpp | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 8fd2b04742f..3f7bc1fae76 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java index ac46da39193..cf4936e2e24 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java +++ b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 94c0b809082..1d319946f78 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -783,13 +783,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEn // when we are using fabric, we need to set the memory access to be // read_write, in order for peer GPUs to have access to this memory. - auto [handle_type, prot_flag] = !fabric ? - std::pair{ - rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, - rmm::mr::cuda_async_memory_resource::access_flags::none} : + auto [handle_type, prot_flag] = fabric ? std::pair{ rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric, - rmm::mr::cuda_async_memory_resource::access_flags::read_write}; + rmm::mr::cuda_async_memory_resource::access_flags::read_write} : + std::pair{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, + rmm::mr::cuda_async_memory_resource::access_flags::none}; auto ret = new rmm::mr::cuda_async_memory_resource( init, release, handle_type, prot_flag); From 3f7e1bd5c5d4123ef6302c4015d20ba4bcc46237 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sun, 8 Dec 2024 13:50:03 -0800 Subject: [PATCH 3/4] clang format Signed-off-by: Alessandro Bellina --- java/src/main/native/src/RmmJni.cpp | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 1d319946f78..eac3377f1ab 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -772,27 +772,21 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv CATCH_STD(env, ) } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv* env, - jclass clazz, - jlong init, - jlong release, - jboolean fabric) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource( + JNIEnv* env, jclass clazz, jlong init, jlong release, jboolean fabric) { try { cudf::jni::auto_set_device(env); - - // when we are using fabric, we need to set the memory access to be + + // when we are using fabric, we need to set the memory access to be // read_write, in order for peer GPUs to have access to this memory. - auto [handle_type, prot_flag] = fabric ? 
- std::pair{ - rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric, - rmm::mr::cuda_async_memory_resource::access_flags::read_write} : - std::pair{ - rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, - rmm::mr::cuda_async_memory_resource::access_flags::none}; - - auto ret = new rmm::mr::cuda_async_memory_resource( - init, release, handle_type, prot_flag); + auto [handle_type, prot_flag] = + fabric ? std::pair{rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric, + rmm::mr::cuda_async_memory_resource::access_flags::read_write} + : std::pair{rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, + rmm::mr::cuda_async_memory_resource::access_flags::none}; + + auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type, prot_flag); return reinterpret_cast(ret); } From 8d27a927777436c2e2835bf3fce235a9acfaac80 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 9 Dec 2024 07:50:01 -0800 Subject: [PATCH 4/4] If not selecting fabric, pass nullopt to keep old CUDA api calls Signed-off-by: Alessandro Bellina --- java/src/main/native/src/RmmJni.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index eac3377f1ab..0f424761bfe 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -778,13 +778,15 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource( try { cudf::jni::auto_set_device(env); - // when we are using fabric, we need to set the memory access to be + // When we are using fabric, we need to set the memory access to be // read_write, in order for peer GPUs to have access to this memory. + // Otherwise, choose default parameters (optional set to nullopt). auto [handle_type, prot_flag] = - fabric ? std::pair{rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric, - rmm::mr::cuda_async_memory_resource::access_flags::read_write} - : std::pair{rmm::mr::cuda_async_memory_resource::allocation_handle_type::none, - rmm::mr::cuda_async_memory_resource::access_flags::none}; + fabric + ? std::pair{std::optional{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric}, + std::optional{rmm::mr::cuda_async_memory_resource::access_flags::read_write}} + : std::pair{std::nullopt, std::nullopt}; auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type, prot_flag);
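
Illustrative usage (not part of the patch series): a minimal Java sketch of how the new CUDA_ASYNC_FABRIC allocation mode introduced above could be exercised from the cuDF Java API. The class name, pool size, and allocation size are hypothetical example values, and the snippet assumes a driver/GPU combination that supports fabric memory handles; internally the mode routes through the new RmmCudaAsyncMemoryResource(size, releaseThreshold, true) constructor, which the JNI layer maps to fabric allocation handles with read-write peer access.

    import ai.rapids.cudf.Cuda;
    import ai.rapids.cudf.DeviceMemoryBuffer;
    import ai.rapids.cudf.Rmm;
    import ai.rapids.cudf.RmmAllocationMode;

    public class FabricPoolExample {
      public static void main(String[] args) {
        // Example value: 1 GiB used for both the initial pool size and the release threshold.
        long poolSize = 1L << 30;
        // CUDA_ASYNC_FABRIC creates the async pool with fabric handles so peer GPUs
        // can be granted read-write access to memory allocated from it.
        Rmm.initialize(RmmAllocationMode.CUDA_ASYNC_FABRIC, null, poolSize);
        try (DeviceMemoryBuffer buf = Rmm.alloc(1024, Cuda.DEFAULT_STREAM)) {
          // buf is served from the fabric-enabled async pool.
        } finally {
          Rmm.shutdown();
        }
      }
    }

Keeping fabric opt-in (the boolean defaults to false, and the non-fabric path passes std::nullopt in the final patch) preserves the existing CUDA driver calls and behavior for all current users of CUDA_ASYNC.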