Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support CUDA async memory resource in JNI #9201

Merged
merged 5 commits into from
Sep 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 28 additions & 11 deletions java/src/main/java/ai/rapids/cudf/Rmm.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ public static LogConf logToStderr() {
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param enableLogging Enable logging memory manager events
* @param poolSize The initial pool size in bytes
Expand All @@ -106,7 +107,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param enableLogging Enable logging memory manager events
* @param poolSize The initial pool size in bytes
Expand Down Expand Up @@ -138,7 +140,8 @@ public static void initialize(int allocationMode, boolean enableLogging, long po
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param logConf How to do logging or null if you don't want to
* @param poolSize The initial pool size in bytes
Expand All @@ -159,7 +162,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param logConf How to do logging or null if you don't want to
* @param poolSize The initial pool size in bytes
Expand All @@ -168,8 +172,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
* @throws IllegalStateException if RMM has already been initialized
* @throws IllegalArgumentException if a max pool size is specified but the allocation mode
* is not {@link RmmAllocationMode#POOL} or
* {@link RmmAllocationMode#ARENA}, or the maximum pool size is
* below the initial size.
* {@link RmmAllocationMode#ARENA} or
* {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool
* size is below the initial size.
*/
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
long maxPoolSize) throws RmmException {
Expand All @@ -186,7 +191,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA} and
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param logConf How to do logging or null if you don't want to
* @param poolSize The initial pool size in bytes
Expand All @@ -198,24 +204,35 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
* @throws IllegalStateException if RMM has already been initialized
* @throws IllegalArgumentException if a max pool size is specified but the allocation mode
* is not {@link RmmAllocationMode#POOL} or
* {@link RmmAllocationMode#ARENA}, or the maximum pool size is
* below the initial size.
* {@link RmmAllocationMode#ARENA} or
* {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool
* size is below the initial size.
*/
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize,
long maxPoolSize, long allocationAlignment, long alignmentThreshold) throws RmmException {
if (initialized) {
throw new IllegalStateException("RMM is already initialized");
}

boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0;
boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0;
boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0;
boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0;

if (maxPoolSize > 0) {
if (allocationMode != RmmAllocationMode.POOL && allocationMode != RmmAllocationMode.ARENA) {
if (!isPool && !isArena && !isAsync) {
throw new IllegalArgumentException(
"Pool limit only supported in POOL or ARENA allocation mode");
"Pool limit only supported in POOL, ARENA, or CUDA_ASYNC allocation mode");
}
if (maxPoolSize < poolSize) {
throw new IllegalArgumentException("Pool limit of " + maxPoolSize
+ " is less than initial pool size of " + poolSize);
}
}
if (isAsync && isManaged) {
throw new IllegalArgumentException(
"CUDA Unified Memory is not supported in CUDA_ASYNC allocation mode");
}
LogLoc loc = LogLoc.NONE;
String path = null;
if (logConf != null) {
Expand Down
6 changes: 5 additions & 1 deletion java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,4 +32,8 @@ public class RmmAllocationMode {
* Use arena suballocation strategy
*/
public static final int ARENA = 0x00000004;
/**
 * Use the CUDA asynchronous pool allocator, i.e. rmm::mr::cuda_async_memory_resource
 * backed by cudaMallocAsync. Cannot be combined with {@link #CUDA_MANAGED_MEMORY}.
 */
public static final int CUDA_ASYNC = 0x00000008;
}
14 changes: 14 additions & 0 deletions java/src/main/native/src/RmmJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@

#include <rmm/mr/device/aligned_resource_adaptor.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
#include <rmm/mr/device/logging_resource_adaptor.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
Expand Down Expand Up @@ -344,6 +346,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(
bool use_pool_alloc = allocation_mode & 1;
bool use_managed_mem = allocation_mode & 2;
bool use_arena_alloc = allocation_mode & 4;
bool use_cuda_async_alloc = allocation_mode & 8;
if (use_pool_alloc) {
auto pool_limit = (max_pool_size > 0) ?
thrust::optional<std::size_t>{static_cast<std::size_t>(max_pool_size)} :
Expand All @@ -365,6 +368,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::arena_memory_resource>(
std::make_shared<rmm::mr::cuda_memory_resource>(), pool_size, pool_limit);
}
} else if (use_cuda_async_alloc) {
auto const pool_limit = max_pool_size > 0 ? static_cast<std::size_t>(max_pool_size) :
std::numeric_limits<std::size_t>::max();
auto const release_threshold = max_pool_size > 0 ?
thrust::optional<std::size_t>{max_pool_size} :
thrust::optional<std::size_t>{};
// Use `limiting_resource_adaptor` to set a hard limit on the max pool size since
// `cuda_async_memory_resource` only has a release threshold.
Initialized_resource = rmm::mr::make_owning_wrapper<rmm::mr::limiting_resource_adaptor>(
std::make_shared<rmm::mr::cuda_async_memory_resource>(pool_size, release_threshold),
pool_limit);
} else if (use_managed_mem) {
Initialized_resource = std::make_shared<rmm::mr::managed_memory_resource>();
} else {
Expand Down
60 changes: 51 additions & 9 deletions java/src/test/java/ai/rapids/cudf/RmmTest.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,6 +32,7 @@
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import static org.junit.jupiter.api.Assumptions.assumeFalse;

public class RmmTest {
private static final long TOO_MUCH_MEMORY = 3L * 1024 * 1024 * 1024 * 1024 * 1024 * 1024;
Expand All @@ -51,18 +52,24 @@ public void teardown() {
}

@ParameterizedTest
@ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testTotalAllocated(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, false, 512 * 1024 * 1024);
assertEquals(0, Rmm.getTotalBytesAllocated());
try (DeviceMemoryBuffer addr = Rmm.alloc(1024)) {
try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) {
assertEquals(1024, Rmm.getTotalBytesAllocated());
}
assertEquals(0, Rmm.getTotalBytesAllocated());
}

@ParameterizedTest
@ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testEventHandler(int rmmAllocMode) {
AtomicInteger invokedCount = new AtomicInteger();
AtomicLong amountRequested = new AtomicLong();
Expand Down Expand Up @@ -328,7 +335,7 @@ public void onDeallocThreshold(long totalAllocSize) {

Rmm.setEventHandler(handler);
DeviceMemoryBuffer addr = Rmm.alloc(6 * 1024);
assertThrows(DeallocThresholdException.class, () -> addr.close());
assertThrows(DeallocThresholdException.class, addr::close);
assertThrows(AllocThresholdException.class, () -> Rmm.alloc(12 * 1024));
assertThrows(AllocFailException.class, () -> Rmm.alloc(TOO_MUCH_MEMORY));
}
Expand Down Expand Up @@ -356,7 +363,10 @@ public void testThreadAutoDeviceSetup() throws Exception {
}

@ParameterizedTest
@ValueSource(ints = {RmmAllocationMode.CUDA_DEFAULT, RmmAllocationMode.POOL})
@ValueSource(ints = {
RmmAllocationMode.CUDA_DEFAULT,
RmmAllocationMode.POOL,
RmmAllocationMode.ARENA})
public void testSetDeviceThrowsAfterRmmInit(int rmmAllocMode) {
Rmm.initialize(rmmAllocMode, false, 1024 * 1024);
assertThrows(CudfException.class, () -> Cuda.setDevice(Cuda.getDevice() + 1));
Expand Down Expand Up @@ -399,9 +409,41 @@ public void testPoolLimitNonPoolMode() {
() -> Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 1024, 2048));
}

private static class AllocFailException extends RuntimeException {}
private static class AllocThresholdException extends RuntimeException {}
private static class DeallocThresholdException extends RuntimeException {}
@Test
public void testCudaAsyncMemoryResourceLimit() {
  try {
    Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024, 2048);
  } catch (CudfException e) {
    // cudaMallocAsync first shipped with CUDA 11.2; on older toolkits the
    // initialization fails and the test is skipped rather than failed.
    assumeFalse(e.getMessage().contains("cudaMallocAsync not supported"));
    throw e;
  }
  // With 512 + 1024 bytes outstanding, one more 1024-byte allocation would
  // exceed the 2048-byte hard limit and must raise OutOfMemoryError.
  try (DeviceMemoryBuffer first = Rmm.alloc(512);
       DeviceMemoryBuffer second = Rmm.alloc(1024)) {
    assertThrows(OutOfMemoryError.class, () -> Rmm.alloc(1024).close());
  }
}

@Test
public void testCudaAsyncIsIncompatibleWithManaged() {
  // Combining the async pool with managed (unified) memory is rejected up
  // front by Rmm.initialize with an IllegalArgumentException.
  int mode = RmmAllocationMode.CUDA_ASYNC | RmmAllocationMode.CUDA_MANAGED_MEMORY;
  assertThrows(IllegalArgumentException.class,
      () -> Rmm.initialize(mode, false, 1024, 2048));
}

/** Marker exception raised by the test event handler when an allocation fails. */
private static class AllocFailException extends RuntimeException {}

/** Marker exception raised by the test event handler at the alloc threshold. */
private static class AllocThresholdException extends RuntimeException {}

/** Marker exception raised by the test event handler at the dealloc threshold. */
private static class DeallocThresholdException extends RuntimeException {}

private static abstract class BaseRmmEventHandler implements RmmEventHandler {
@Override
Expand Down