Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new APIs for GPU memory monitoring (max_memory_allocated, max_memory_reserved, memory_allocated, memory_reserved) #38657

Merged
merged 16 commits into from
Mar 30, 2022
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion paddle/fluid/memory/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ endif()
cc_library(malloc SRCS malloc.cc DEPS
place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS})
cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
cc_library(stats SRCS stats.cc DEPS os_info)

cc_library(memory DEPS malloc memcpy)
cc_library(memory DEPS malloc memcpy stats)

if (WITH_GPU)
nv_test(malloc_test
Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cc_library(allocator SRCS allocator.cc DEPS place)
cc_library(allocator SRCS allocator.cc DEPS place stats)
cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
Expand All @@ -14,7 +14,7 @@ else ()
endif()

if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats)
nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph)
Expand All @@ -27,7 +27,7 @@ if (WITH_GPU)
endif()

if (WITH_ROCM)
hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats)
hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
Expand Down Expand Up @@ -101,7 +101,7 @@ endif()
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stats)

if (WITH_GPU)
target_link_libraries(allocator_facade cuda_graph)
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/memory/allocation/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
#include <vector>

#include "paddle/fluid/framework/inlined_vector.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/allocator.h"

Expand Down
18 changes: 18 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/stat_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"

Expand Down Expand Up @@ -292,6 +293,8 @@ class AllocatorFacadePrivate {
WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
}

WrapStatAllocator();

CheckAllocThreadSafe();

#ifdef PADDLE_WITH_CUDA
Expand Down Expand Up @@ -465,6 +468,7 @@ class AllocatorFacadePrivate {
InitAutoGrowthCUDAAllocator(p, stream);
WrapStreamSafeCUDAAllocator(p, stream);
WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
WrapStatAllocator(p, stream);
}
}

Expand Down Expand Up @@ -639,6 +643,11 @@ class AllocatorFacadePrivate {
allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
}

// Wraps the per-(place, stream) CUDA allocator with a StatAllocator so that
// every allocation/free issued on this stream updates the device memory
// statistics. Assumes cuda_allocators_[p][stream] was already created by the
// preceding Init/Wrap steps in InitStreamSafeCUDAAllocator.
void WrapStatAllocator(platform::CUDAPlace p, gpuStream_t stream) {
std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
allocator = std::make_shared<StatAllocator>(allocator);
}

#ifdef PADDLE_WITH_CUDA
void WrapCUDAGraphAllocator() {
for (auto& item : allocators_) {
Expand Down Expand Up @@ -820,6 +829,15 @@ class AllocatorFacadePrivate {
}
}

// Wraps every GPU allocator in allocators_ with a StatAllocator so that
// allocations are counted in the memory stats. Non-GPU places are left
// unwrapped because stats are GPU-only for now.
void WrapStatAllocator() {
for (auto& pair : allocators_) {
// Now memory stats is only supported for GPU
if (platform::is_gpu_place(pair.first)) {
pair.second = std::make_shared<StatAllocator>(pair.second);
}
}
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// a standalone CUDA allocator to support multi-stream GC in new executor
CUDAAllocatorMap cuda_allocators_;
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/memory/allocation/cuda_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"

#include "paddle/fluid/platform/monitor.h"
DECLARE_string(memory_stats_opt);

namespace paddle {
namespace memory {
namespace allocation {
Expand Down
56 changes: 56 additions & 0 deletions paddle/fluid/memory/allocation/stat_allocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/stats.h"

namespace paddle {
namespace memory {
namespace allocation {

// Decorator allocator: forwards every request to an underlying allocator
// while keeping the per-device "Allocated" memory statistic up to date.
// Thread safety is delegated to MEMORY_STAT_UPDATE and the wrapped allocator.
class StatAllocator : public Allocator {
 public:
  explicit StatAllocator(std::shared_ptr<Allocator> underlying_allocator)
      : underlying_allocator_(std::move(underlying_allocator)) {}

  bool IsAllocThreadSafe() const override { return true; }

 protected:
  void FreeImpl(phi::Allocation* allocation) override {
    // Snapshot the bookkeeping fields before the underlying allocator
    // destroys the allocation object.
    const int device_id = allocation->place().GetDeviceId();
    const int64_t bytes = static_cast<int64_t>(allocation->size());
    MEMORY_STAT_UPDATE(Allocated, device_id, -bytes);
    underlying_allocator_->Free(allocation);
  }

  phi::Allocation* AllocateImpl(size_t size) override {
    auto allocation = underlying_allocator_->Allocate(size);
    MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
                       allocation->size());
    // Caller (the Allocator base) takes ownership of the raw pointer.
    return allocation.release();
  }

  uint64_t ReleaseImpl(const platform::Place& place) override {
    return underlying_allocator_->Release(place);
  }

 private:
  std::shared_ptr<Allocator> underlying_allocator_;
};

} // namespace allocation
} // namespace memory
} // namespace paddle
1 change: 1 addition & 0 deletions paddle/fluid/memory/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ limitations under the License. */

#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/stats.h"
118 changes: 118 additions & 0 deletions paddle/fluid/memory/stats.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/memory/stats.h"

#include "paddle/fluid/memory/allocation/spin_lock.h"
#include "paddle/fluid/platform/variant.h"

namespace paddle {
namespace memory {

// Process-wide registry mapping "STAT_Device<id>_<type>" keys to StatBase
// instances. Stats are registered during static initialization and then
// queried/updated on every allocation, so all map accesses are guarded by
// a spin lock.
class StatRegistry {
 public:
  // Meyers singleton; construction is thread-safe since C++11.
  static StatRegistry* GetInstance() {
    static StatRegistry instance;
    return &instance;
  }

  // Returns the registered stat for (stat_type, dev_id).
  // Throws InvalidArgument if no such stat has been registered.
  StatBase* GetStat(const std::string& stat_type, int dev_id) {
    // Lock the read: Register/Unregister may mutate the map concurrently.
    std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
    auto it = stat_map_.find(GetStatKey(stat_type, dev_id));
    if (it == stat_map_.end()) {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "The STAT type \"%s\" for device %d has not been registered.",
          stat_type.c_str(), dev_id));
    }
    return it->second;
  }

  std::string GetStatKey(const std::string& stat_type, int dev_id) {
    return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type;
  }

  int64_t GetCurrentValue(const std::string& stat_type, int dev_id) {
    return GetStat(stat_type, dev_id)->GetCurrentValue();
  }

  int64_t GetPeakValue(const std::string& stat_type, int dev_id) {
    return GetStat(stat_type, dev_id)->GetPeakValue();
  }

  void Register(const std::string& stat_type, int dev_id, StatBase* stat) {
    std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
    stat_map_[GetStatKey(stat_type, dev_id)] = stat;
  }

  void Unregister(const std::string& stat_type, int dev_id) {
    std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
    stat_map_.erase(GetStatKey(stat_type, dev_id));
  }

  // Routes through GetStat() (find + throw) instead of operator[], which
  // would default-insert a null StatBase* for an unknown key — crashing on
  // the Update call — and would mutate the map without holding the lock.
  void Update(const std::string& stat_type, int dev_id, int64_t increment) {
    GetStat(stat_type, dev_id)->Update(increment);
  }

 private:
  StatRegistry() = default;

  DISABLE_COPY_AND_ASSIGN(StatRegistry);

  std::unordered_map<std::string, StatBase*> stat_map_;
  SpinLock stat_map_lock_;
};

// Returns the current value of the (stat_type, dev_id) memory stat.
int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) {
  auto* registry = StatRegistry::GetInstance();
  return registry->GetCurrentValue(stat_type, dev_id);
}

// Returns the peak (high-watermark) value of the (stat_type, dev_id) stat.
int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) {
  auto* registry = StatRegistry::GetInstance();
  return registry->GetPeakValue(stat_type, dev_id);
}

// Adjusts the (stat_type, dev_id) stat by `increment` (may be negative).
void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) {
  auto* registry = StatRegistry::GetInstance();
  registry->Update(stat_type, dev_id, increment);
}

// Registers the thread-local stat singleton for one (item, device id) pair.
// `id` must be a literal: it is token-pasted into the
// ThreadLocalStatDevice##id##item type name.
#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \
StatRegistry::GetInstance()->Register( \
#item, id, Stat<ThreadLocalStatDevice##id##item>::GetInstance());

// Registers `item` for device ids 0-15. The ids must be spelled out because
// each expands to a distinct token-pasted type; keep in sync with the
// ThreadLocalStatDevice*<item> declarations in stats.h.
#define MEMORY_STAT_REGISTER(item) \
MEMORY_STAT_REGISTER_WITH_ID(item, 0); \
MEMORY_STAT_REGISTER_WITH_ID(item, 1); \
MEMORY_STAT_REGISTER_WITH_ID(item, 2); \
MEMORY_STAT_REGISTER_WITH_ID(item, 3); \
MEMORY_STAT_REGISTER_WITH_ID(item, 4); \
MEMORY_STAT_REGISTER_WITH_ID(item, 5); \
MEMORY_STAT_REGISTER_WITH_ID(item, 6); \
MEMORY_STAT_REGISTER_WITH_ID(item, 7); \
MEMORY_STAT_REGISTER_WITH_ID(item, 8); \
MEMORY_STAT_REGISTER_WITH_ID(item, 9); \
MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
MEMORY_STAT_REGISTER_WITH_ID(item, 15)

// Registers the "Allocated" and "Reserved" stats for device ids 0-15.
// Returns a dummy int so the call can drive the static initializer below.
int RegisterAllStats() {
  MEMORY_STAT_REGISTER(Allocated);
  MEMORY_STAT_REGISTER(Reserved);
  return 0;
}

// File-local static whose initializer performs the registration when this
// translation unit is loaded, before any stat is queried or updated.
// (Identifier fixed from the misspelled "regiester_all_stats"; it is a
// file-scope static with no external references.)
UNUSED static int register_all_stats = RegisterAllStats();

} // namespace memory
} // namespace paddle
Loading