Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new APIs for GPU memory monitoring (max_memory_allocated, max_memory_reserved, memory_allocated, memory_reserved) #38657

Merged
merged 16 commits into from
Mar 30, 2022
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion paddle/fluid/memory/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ endif()
cc_library(malloc SRCS malloc.cc DEPS
place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS})
cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
cc_library(stats SRCS stats.cc DEPS os_info)

cc_library(memory DEPS malloc memcpy)
cc_library(memory DEPS malloc memcpy stats)

if (WITH_GPU)
nv_test(malloc_test
Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cc_library(allocator SRCS allocator.cc DEPS place)
cc_library(allocator SRCS allocator.cc DEPS place stats)
cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
Expand All @@ -14,7 +14,7 @@ else ()
endif()

if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats)
nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph)
Expand All @@ -27,7 +27,7 @@ if (WITH_GPU)
endif()

if (WITH_ROCM)
hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats)
hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
Expand Down Expand Up @@ -101,7 +101,7 @@ endif()
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stats)

if (WITH_GPU)
target_link_libraries(allocator_facade cuda_graph)
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/memory/allocation/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
#include <vector>

#include "paddle/fluid/framework/inlined_vector.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/allocator.h"

Expand Down
18 changes: 18 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/stat_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"

Expand Down Expand Up @@ -292,6 +293,8 @@ class AllocatorFacadePrivate {
WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
}

WrapStatAllocator();

CheckAllocThreadSafe();

#ifdef PADDLE_WITH_CUDA
Expand Down Expand Up @@ -465,6 +468,7 @@ class AllocatorFacadePrivate {
InitAutoGrowthCUDAAllocator(p, stream);
WrapStreamSafeCUDAAllocator(p, stream);
WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
WrapStatAllocator(p, stream);
}
}

Expand Down Expand Up @@ -639,6 +643,11 @@ class AllocatorFacadePrivate {
allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
}

// Wraps the per-(place, stream) CUDA allocator with a StatAllocator so that
// every allocation/free issued on this stream updates the device memory
// statistics. Assumes cuda_allocators_[p][stream] was already created by the
// preceding Init/Wrap steps in InitStreamSafeCUDAAllocator.
void WrapStatAllocator(platform::CUDAPlace p, gpuStream_t stream) {
std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
allocator = std::make_shared<StatAllocator>(allocator);
}

#ifdef PADDLE_WITH_CUDA
void WrapCUDAGraphAllocator() {
for (auto& item : allocators_) {
Expand Down Expand Up @@ -820,6 +829,15 @@ class AllocatorFacadePrivate {
}
}

// Wraps every GPU allocator in allocators_ with a StatAllocator so that
// allocations are counted in the memory stats. Non-GPU places are left
// unwrapped because stats are GPU-only for now.
void WrapStatAllocator() {
for (auto& pair : allocators_) {
// Now memory stats is only supported for GPU
if (platform::is_gpu_place(pair.first)) {
pair.second = std::make_shared<StatAllocator>(pair.second);
}
}
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// a standalone CUDA allocator to support multi-stream GC in new executor
CUDAAllocatorMap cuda_allocators_;
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/memory/allocation/cuda_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"

#include "paddle/fluid/platform/monitor.h"
DECLARE_string(memory_stats_opt);

namespace paddle {
namespace memory {
namespace allocation {
Expand Down
56 changes: 56 additions & 0 deletions paddle/fluid/memory/allocation/stat_allocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/stats.h"

namespace paddle {
namespace memory {
namespace allocation {

// Decorator allocator: forwards every request to an underlying allocator
// while keeping the per-device "Allocated" memory statistic up to date.
// Thread safety is delegated to MEMORY_STAT_UPDATE and the wrapped allocator.
class StatAllocator : public Allocator {
 public:
  explicit StatAllocator(std::shared_ptr<Allocator> underlying_allocator)
      : underlying_allocator_(std::move(underlying_allocator)) {}

  bool IsAllocThreadSafe() const override { return true; }

 protected:
  void FreeImpl(phi::Allocation* allocation) override {
    // Snapshot the bookkeeping fields before the underlying allocator
    // destroys the allocation object.
    const int device_id = allocation->place().GetDeviceId();
    const int64_t bytes = static_cast<int64_t>(allocation->size());
    MEMORY_STAT_UPDATE(Allocated, device_id, -bytes);
    underlying_allocator_->Free(allocation);
  }

  phi::Allocation* AllocateImpl(size_t size) override {
    auto allocation = underlying_allocator_->Allocate(size);
    MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
                       allocation->size());
    // Caller (the Allocator base) takes ownership of the raw pointer.
    return allocation.release();
  }

  uint64_t ReleaseImpl(const platform::Place& place) override {
    return underlying_allocator_->Release(place);
  }

 private:
  std::shared_ptr<Allocator> underlying_allocator_;
};

} // namespace allocation
} // namespace memory
} // namespace paddle
1 change: 1 addition & 0 deletions paddle/fluid/memory/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ limitations under the License. */

#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/stats.h"
118 changes: 118 additions & 0 deletions paddle/fluid/memory/stats.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/memory/stats.h"

#include "paddle/fluid/memory/allocation/spin_lock.h"
#include "paddle/fluid/platform/variant.h"

namespace paddle {
namespace memory {

// Process-wide registry mapping "STAT_Device<id>_<type>" keys to StatBase
// instances. Stats are registered during static initialization and then
// queried/updated on every allocation, so all map accesses are guarded by
// a spin lock.
class StatRegistry {
 public:
  // Meyers singleton; construction is thread-safe since C++11.
  static StatRegistry* GetInstance() {
    static StatRegistry instance;
    return &instance;
  }

  // Returns the registered stat for (stat_type, dev_id).
  // Throws InvalidArgument if no such stat has been registered.
  StatBase* GetStat(const std::string& stat_type, int dev_id) {
    // Lock the read: Register/Unregister may mutate the map concurrently.
    std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
    auto it = stat_map_.find(GetStatKey(stat_type, dev_id));
    if (it == stat_map_.end()) {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "The STAT type \"%s\" for device %d has not been registered.",
          stat_type.c_str(), dev_id));
    }
    return it->second;
  }

  std::string GetStatKey(const std::string& stat_type, int dev_id) {
    return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type;
  }

  int64_t GetCurrentValue(const std::string& stat_type, int dev_id) {
    return GetStat(stat_type, dev_id)->GetCurrentValue();
  }

  int64_t GetPeakValue(const std::string& stat_type, int dev_id) {
    return GetStat(stat_type, dev_id)->GetPeakValue();
  }

  void Register(const std::string& stat_type, int dev_id, StatBase* stat) {
    std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
    stat_map_[GetStatKey(stat_type, dev_id)] = stat;
  }

  void Unregister(const std::string& stat_type, int dev_id) {
    std::lock_guard<SpinLock> lock_guard(stat_map_lock_);
    stat_map_.erase(GetStatKey(stat_type, dev_id));
  }

  // Routes through GetStat() (find + throw) instead of operator[], which
  // would default-insert a null StatBase* for an unknown key — crashing on
  // the Update call — and would mutate the map without holding the lock.
  void Update(const std::string& stat_type, int dev_id, int64_t increment) {
    GetStat(stat_type, dev_id)->Update(increment);
  }

 private:
  StatRegistry() = default;

  DISABLE_COPY_AND_ASSIGN(StatRegistry);

  std::unordered_map<std::string, StatBase*> stat_map_;
  SpinLock stat_map_lock_;
};

// Returns the current value of the (stat_type, dev_id) memory stat.
int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) {
  auto* registry = StatRegistry::GetInstance();
  return registry->GetCurrentValue(stat_type, dev_id);
}

// Returns the peak (high-watermark) value of the (stat_type, dev_id) stat.
int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) {
  auto* registry = StatRegistry::GetInstance();
  return registry->GetPeakValue(stat_type, dev_id);
}

// Adjusts the (stat_type, dev_id) stat by `increment` (may be negative).
void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) {
  auto* registry = StatRegistry::GetInstance();
  registry->Update(stat_type, dev_id, increment);
}

// Registers the thread-local stat singleton for one (item, device id) pair.
// `id` must be a literal: it is token-pasted into the
// ThreadLocalStatDevice##id##item type name.
#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \
StatRegistry::GetInstance()->Register( \
#item, id, Stat<ThreadLocalStatDevice##id##item>::GetInstance());

// Registers `item` for device ids 0-15. The ids must be spelled out because
// each expands to a distinct token-pasted type; keep in sync with the
// ThreadLocalStatDevice*<item> declarations in stats.h.
#define MEMORY_STAT_REGISTER(item) \
MEMORY_STAT_REGISTER_WITH_ID(item, 0); \
MEMORY_STAT_REGISTER_WITH_ID(item, 1); \
MEMORY_STAT_REGISTER_WITH_ID(item, 2); \
MEMORY_STAT_REGISTER_WITH_ID(item, 3); \
MEMORY_STAT_REGISTER_WITH_ID(item, 4); \
MEMORY_STAT_REGISTER_WITH_ID(item, 5); \
MEMORY_STAT_REGISTER_WITH_ID(item, 6); \
MEMORY_STAT_REGISTER_WITH_ID(item, 7); \
MEMORY_STAT_REGISTER_WITH_ID(item, 8); \
MEMORY_STAT_REGISTER_WITH_ID(item, 9); \
MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
MEMORY_STAT_REGISTER_WITH_ID(item, 15)

// Registers the "Allocated" and "Reserved" stats for device ids 0-15.
// Returns a dummy int so the call can drive the static initializer below.
int RegisterAllStats() {
  MEMORY_STAT_REGISTER(Allocated);
  MEMORY_STAT_REGISTER(Reserved);
  return 0;
}

// File-local static whose initializer performs the registration when this
// translation unit is loaded, before any stat is queried or updated.
// (Identifier fixed from the misspelled "regiester_all_stats"; it is a
// file-scope static with no external references.)
UNUSED static int register_all_stats = RegisterAllStats();

} // namespace memory
} // namespace paddle
Loading