Skip to content

Commit

Permalink
fix max bytes dealloc bug
Browse files Browse the repository at this point in the history
  • Loading branch information
zpuller committed Oct 28, 2024
1 parent 156ad0c commit c65aeb0
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
4 changes: 3 additions & 1 deletion src/main/cpp/src/SparkResourceAdaptorJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1780,6 +1780,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
auto const thread = threads.find(tid);
if (thread != threads.end()) {
log_status("DEALLOC", tid, thread->second.task_id, thread->second.state);
if (!is_for_cpu) {
thread->second.gpu_memory_allocated_bytes -= num_bytes;
}
} else {
log_status("DEALLOC", tid, -2, thread_state::UNKNOWN);
}
Expand All @@ -1802,7 +1805,6 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
if (is_for_cpu == t_state.is_cpu_alloc) {
transition(t_state, thread_state::THREAD_ALLOC_FREE);
}
if (!is_for_cpu) { t_state.gpu_memory_allocated_bytes -= num_bytes; }
break;
default: break;
}
Expand Down
7 changes: 6 additions & 1 deletion src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ public void testInsertOOMsGpu() {
assertThrows(GpuSplitAndRetryOOM.class, () -> Rmm.alloc(100).close());
assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid));
assertEquals(1, RmmSpark.getAndResetNumSplitRetryThrow(taskid));
assertEquals(ALIGNMENT * 2, RmmSpark.getAndResetGpuMaxMemoryAllocated(taskid));
assertEquals(ALIGNMENT, RmmSpark.getAndResetGpuMaxMemoryAllocated(taskid));

// Verify that injecting OOM does not cause the block to actually happen
assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId));
Expand Down Expand Up @@ -818,6 +818,11 @@ public void testBasicMixedBlocking() throws ExecutionException, InterruptedExcep
secondGpuAlloc.waitForAlloc();
secondGpuAlloc.freeAndWait();
}
// Do one more alloc after freeing on same task to show the max allocation metric is unimpacted
try (AllocOnAnotherThread secondGpuAlloc = new GpuAllocOnAnotherThread(taskThree, FIVE_MB)) {
secondGpuAlloc.waitForAlloc();
secondGpuAlloc.freeAndWait();
}
}
}
} finally {
Expand Down

0 comments on commit c65aeb0

Please sign in to comment.