Support CostInfo and MemProfiler in InterpreterCore #34981

Merged: 13 commits (Aug 31, 2021)
52 changes: 51 additions & 1 deletion paddle/fluid/framework/new_executor/interpretercore.cc
@@ -349,6 +349,9 @@ void InterpreterCore::BuildInstructionCtx(Instruction* instr_node,
}

void InterpreterCore::RunInstruction(const Instruction& instr_node) {
VLOG(3) << "RunInstruction: "
<< instr_node.kernel_func_.operator_base_->Type();

static_cast<const framework::OperatorWithKernel*>(
instr_node.kernel_func_.operator_base_)
->InferShape(instr_node.infershape_ctx_.get());
@@ -364,7 +367,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {

void InterpreterCore::ExecuteInstructionList(
const std::vector<Instruction>& vec_instr, const VariableScope& var_scope,
const platform::Place& place) {
const platform::Place& place, bool is_dry_run) {
std::queue<size_t> working_queue;
auto working_dependecy_count = dependecy_count_;
for (size_t i = 0; i < dependecy_count_.size(); ++i) {
@@ -385,6 +388,11 @@ void InterpreterCore::ExecuteInstructionList(
// step2: run instruction
RunInstruction(instr_node);
++run_op_number;

if (is_dry_run) {
profiler_.ParseMemoryInfo(var_scope.var_list);
}

// step3: insert event for out_vars if needed
RecordEventInstruction(instr_node, vec_func_list_[instr_id]);

@@ -827,6 +835,48 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
VLOG(3) << "run " << op_base->Type() << " done.";
}
}
void InterpreterCore::Prepare(
const std::vector<framework::Tensor>& feed_tensors) {
auto FeedInput = [&] {
for (size_t i = 0; i < feed_names_.size(); ++i) {
auto it = global_scope_->name2id.find(feed_names_[i]);
assert(it != global_scope_->name2id.end());

auto feed_tensor = global_scope_->var_list[it->second]
->GetMutable<framework::LoDTensor>();
feed_tensor->ShareDataWith(feed_tensors[i]);
}
};

if (is_build_ == false) {
BuildVariableScope(main_program_, global_scope_);
FeedInput();
BuildOpFuncList(place_, main_program_, &op_list_, &vec_func_list_,
global_scope_);
is_build_ = true;
// convert vec func_list to graph
Convert();
}
// NOTE: feed_tensor will be garbage-collected after BuildOpFuncList, so
// FeedInput must be called again here.
FeedInput();
}

const CostInfo& InterpreterCore::DryRun(
const std::vector<framework::Tensor>& feed_tensors) {
Prepare(feed_tensors);
// DryRun may be called many times.
profiler_.Reset();
profiler_.Start();
ExecuteInstructionList(vec_instruction_, *global_scope_, place_,
/*is_dry_run=*/true);
platform::DeviceContextPool::Instance().Get(place_)->Wait();

profiler_.Pause();
profiler_.TotalCUDAAllocatedMemorySize(place_);
return profiler_.GetCostInfo();
}

platform::DeviceContext* InterpreterCore::ParseDeviceContextForInstruction(
const OpFuncNode& op_func_node, const OperatorBase& op_base) {
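
A minimal sketch (not part of this PR) of how a caller might consume the new DryRun API and the CostInfo it returns; the InterpreterCore pointer and feed tensors are assumptions, set up the same way as for InterpreterCore::Run.

#include "paddle/fluid/framework/new_executor/interpretercore.h"

void ReportCostInfo(paddle::framework::InterpreterCore* core,
                    const std::vector<paddle::framework::Tensor>& feed_tensors) {
  // DryRun executes the instruction list once and aggregates time/memory
  // numbers in the interpreter-owned profiler.
  const auto& cost = core->DryRun(feed_tensors);
  VLOG(1) << "total_time(ms)=" << cost.total_time
          << ", host_memory_bytes=" << cost.host_memory_bytes
          << ", device_memory_bytes=" << cost.device_memory_bytes
          << ", device_total_memory_bytes=" << cost.device_total_memory_bytes;
}
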
9 changes: 8 additions & 1 deletion paddle/fluid/framework/new_executor/interpretercore.h
@@ -21,6 +21,7 @@

#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/framework/new_executor/profiler.h"
#include "paddle/fluid/framework/new_executor/workqueue.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor.h"
@@ -41,6 +42,8 @@ class InterpreterCore {
paddle::framework::FetchList Run(
const std::vector<framework::Tensor>& feed_tensors);

const CostInfo& DryRun(const std::vector<framework::Tensor>& feed_tensors);

static void BuildOpFuncList(const platform::Place& place,
const framework::ProgramDesc& pdesc,
std::vector<OperatorBase*>* op_list,
@@ -58,14 +61,17 @@

void ExecuteInstructionList(const std::vector<Instruction>& vec_instr,
const VariableScope& var_scope,
const platform::Place& place);
const platform::Place& place,
bool is_dry_run = false);

std::vector<size_t> MergeVector(const std::vector<size_t>& first,
const std::vector<size_t>& second);

void BuildVariableScope(const framework::ProgramDesc& pdesc,
VariableScope* var_scope);

void Prepare(const std::vector<framework::Tensor>& feed_tensors);

void CheckGC(size_t instr_id, const std::vector<size_t>& gc_check_list,
const VariableScope& var_scope, const platform::Place& place,
std::vector<VariableMetaInfo>& working_var_ref); // NOLINT
@@ -100,6 +106,7 @@ class InterpreterCore {
bool is_build_;

std::vector<std::string> feed_names_;
InterpreterProfiler profiler_;
std::map<size_t, std::shared_ptr<platform::DeviceEvent>> var_id2event_;

std::vector<paddle::platform::DeviceEvent> gc_event_;
121 changes: 121 additions & 0 deletions paddle/fluid/framework/new_executor/profiler.h
@@ -0,0 +1,121 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/timer.h"

namespace paddle {
namespace framework {

static void GetTensors(Variable* var, std::unordered_set<Tensor*>* tensor_set) {
if (var->IsType<LoDTensor>() && var->Get<LoDTensor>().IsInitialized()) {
tensor_set->insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>() &&
var->Get<SelectedRows>().value().IsInitialized()) {
tensor_set->insert(var->GetMutable<SelectedRows>()->mutable_value());
} else if (var->IsType<LoDTensorArray>()) {
auto* tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto& t : *tensor_arr) {
if (t.IsInitialized()) {
tensor_set->insert(&t);
}
}
}
}

static std::pair<size_t, size_t> GetTensorMemorySize(
const std::vector<Variable*>& var_list) {
std::unordered_set<Tensor*> tensor_set;
for (auto* var : var_list) {
GetTensors(var, &tensor_set);
}
size_t host_memory_bytes = 0;
size_t device_memory_bytes = 0;
std::unordered_set<memory::Allocation*> allocation_set;
for (auto* tensor : tensor_set) {
auto allocation = tensor->Holder().get();
if (!allocation_set.count(allocation)) {
allocation_set.insert(allocation);
if (platform::is_cuda_pinned_place(tensor->place()) ||
platform::is_cpu_place(tensor->place())) {
VLOG(3) << "found host memory : " << allocation->size();
host_memory_bytes += allocation->size();
} else {
VLOG(3) << "found device memory : " << allocation->size();
device_memory_bytes += allocation->size();
}
}
}
return {host_memory_bytes, device_memory_bytes};
}

struct CostInfo {
double total_time{0.}; // ms
size_t host_memory_bytes{0}; // bytes
size_t device_memory_bytes{0}; // bytes
size_t device_total_memory_bytes{0}; // total allocated memory size
};

class InterpreterProfiler {
public:
void Start() { timer_.Start(); }

void Pause() {
timer_.Pause();
cost_info_.total_time += timer_.ElapsedMS();
}

void Reset() {
timer_.Reset();
cost_info_.total_time = 0.;
cost_info_.host_memory_bytes = 0;
cost_info_.device_memory_bytes = 0;
cost_info_.device_total_memory_bytes = 0;
}

void ParseMemoryInfo(const std::vector<Variable*>& vars) {
timer_.Start();
auto memory_info = GetTensorMemorySize(vars);
VLOG(3) << "host memory size: " << memory_info.first;
cost_info_.host_memory_bytes =
std::max(cost_info_.host_memory_bytes, memory_info.first);

VLOG(3) << "device memory size: " << memory_info.second;
cost_info_.device_memory_bytes =
std::max(cost_info_.device_memory_bytes, memory_info.second);
timer_.Pause();
cost_info_.total_time -= timer_.ElapsedMS();
}

void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
cost_info_.device_total_memory_bytes =
platform::RecordedCudaMallocSize(cuda_place.device);
#endif
}
}

const CostInfo& GetCostInfo() const { return cost_info_; }

private:
platform::Timer timer_;
CostInfo cost_info_;
};
} // namespace framework
} // namespace paddle
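
A minimal sketch (not part of this PR) of what the allocation de-duplication in GetTensorMemorySize means in practice: two LoDTensors that share one Allocation contribute its size only once. The tensor setup below is an assumption (CPU-only) used purely for illustration.

#include "paddle/fluid/framework/new_executor/profiler.h"

void MemorySizeExample() {
  using paddle::framework::LoDTensor;
  using paddle::framework::Variable;

  Variable a, b;
  auto* ta = a.GetMutable<LoDTensor>();
  ta->Resize(paddle::framework::make_ddim({4, 4}));
  ta->mutable_data<float>(paddle::platform::CPUPlace());

  auto* tb = b.GetMutable<LoDTensor>();
  tb->ShareDataWith(*ta);  // tb reuses ta's underlying Allocation

  auto sizes = paddle::framework::GetTensorMemorySize({&a, &b});
  // sizes.first  -> host bytes: 4 * 4 * sizeof(float), counted once
  // sizes.second -> device bytes: 0 in this CPU-only example
}
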
9 changes: 9 additions & 0 deletions paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -61,6 +61,15 @@ paddle::framework::FetchList StandaloneExecutor::Run(
return core->Run(feed_tensors);
}

const CostInfo& StandaloneExecutor::DryRun(
const std::vector<std::string>& feed_names,
const std::vector<framework::Tensor>& feed_tensors) {
auto core = GetInterpreterCore(feed_names, {});

auto& cost_info = core->DryRun(feed_tensors);
return cost_info;
}

void StandaloneExecutor::BuildVariableOuterScope(
const framework::ProgramDesc& pdesc, VariableScope* var_scope,
Scope* outer_scope) {
3 changes: 3 additions & 0 deletions paddle/fluid/framework/new_executor/standalone_executor.h
@@ -45,6 +45,9 @@ class StandaloneExecutor : public ExecutorBase {
const std::vector<framework::Tensor>& feed_tensors,
const std::vector<std::string>& fetch_names);

const CostInfo& DryRun(const std::vector<std::string>& feed_names,
const std::vector<framework::Tensor>& feed_tensors);

private:
void BuildVariableOuterScope(const framework::ProgramDesc& pdesc,
VariableScope* var_scope, Scope* outer_scope);
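
A minimal sketch (not part of this PR) of the executor-level API: dry-run a program to inspect its memory demand before the real run. The executor pointer, the feed name "x", and the tensor are assumptions, set up the same way as for Run().

paddle::framework::FetchList ProfileThenRun(
    paddle::framework::StandaloneExecutor* exec,
    const paddle::framework::Tensor& x,
    const std::vector<std::string>& fetch_names) {
  const auto& cost = exec->DryRun({"x"}, {x});
  // device_memory_bytes is the peak over the per-instruction snapshots
  // collected during the dry run.
  VLOG(1) << "expected device memory: " << cost.device_memory_bytes << " bytes";
  return exec->Run({"x"}, {x}, fetch_names);
}
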
20 changes: 6 additions & 14 deletions paddle/fluid/platform/gpu_info.cc
@@ -499,7 +499,7 @@ class RecordedCudaMallocHelper {
*/
gpuError_t Malloc(void **ptr, size_t size) {
LockGuardPtr<std::mutex> lock(mtx_);
if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
#ifdef PADDLE_WITH_HIP
return hipErrorOutOfMemory;
#else
@@ -514,9 +514,7 @@
auto result = cudaMalloc(ptr, size);
#endif
if (result == gpuSuccess) {
if (NeedRecord()) {
cur_size_ += size;
}
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
return gpuSuccess;
} else {
@@ -551,10 +549,7 @@
if (err != cudaErrorCudartUnloading) {
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(err);
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
cur_size_ -= size;
}
cur_size_.fetch_sub(size);
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
} else {
#ifdef PADDLE_WITH_HIP
@@ -582,7 +577,7 @@

if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
*avail = std::min(*actual_avail, limit_size_ - cur_size_);
*avail = std::min(*actual_avail, limit_size_ - cur_size_.load());
*total = std::min(*actual_total, limit_size_);
return *total < *actual_total;
} else {
Expand All @@ -594,17 +589,14 @@ class RecordedCudaMallocHelper {

inline bool NeedRecord() const { return limit_size_ != 0; }

uint64_t RecordedSize() const {
LockGuardPtr<std::mutex> lock(mtx_);
return NeedRecord() ? cur_size_ : 0;
}
uint64_t RecordedSize() const { return cur_size_.load(); }

uint64_t LimitSize() const { return limit_size_; }

private:
const int dev_id_;
const uint64_t limit_size_;
uint64_t cur_size_{0};
std::atomic<uint64_t> cur_size_{0};

mutable std::unique_ptr<std::mutex> mtx_;

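
A standalone sketch (not Paddle code) of the counter pattern adopted above: with std::atomic the size bookkeeping no longer needs the mutex, and it is updated on every allocation and free rather than only when a limit is configured.

#include <atomic>
#include <cstdint>

class RecordedSizeCounter {
 public:
  void OnMalloc(uint64_t size) { cur_size_.fetch_add(size); }
  void OnFree(uint64_t size) { cur_size_.fetch_sub(size); }
  // Safe to call concurrently with OnMalloc/OnFree, no lock required.
  uint64_t Recorded() const { return cur_size_.load(); }

 private:
  std::atomic<uint64_t> cur_size_{0};
};
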
32 changes: 31 additions & 1 deletion paddle/fluid/pybind/pybind.cc
@@ -1951,6 +1951,16 @@ All parameter, weight, gradient are variables in Paddle.
fetch_vars);
});

py::class_<framework::CostInfo>(m, "CostInfo")
.def(py::init<>())
.def("total_time", [](CostInfo &self) { return self.total_time; })
.def("host_memory_bytes",
[](CostInfo &self) { return self.host_memory_bytes; })
.def("device_memory_bytes",
[](CostInfo &self) { return self.device_memory_bytes; })
.def("device_total_memory_bytes",
[](CostInfo &self) { return self.device_total_memory_bytes; });

py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
.def(py::init<const platform::Place &, const ProgramDesc &,
const ProgramDesc &, Scope *>())
@@ -1974,8 +1984,28 @@
pybind11::gil_scoped_release release;
ret = self.Run(feed_names, feed_tensors, fetch_names);
}

return py::cast(std::move(ret));
})
.def("dry_run",
[](StandaloneExecutor &self,
const std::unordered_map<std::string, py::array> &input_dict) {
std::vector<framework::Tensor> feed_tensors;
std::vector<std::string> feed_names;

for (auto &item : input_dict) {
framework::LoDTensor t;
SetTensorFromPyArray<platform::CPUPlace>(
&t, item.second, platform::CPUPlace(), false);
feed_names.push_back(item.first);
feed_tensors.push_back(t);
}

CostInfo cost_info;
{
pybind11::gil_scoped_release release;
cost_info = self.DryRun(feed_names, feed_tensors);
}
return cost_info;
});

m.def("init_gflags", framework::InitGflags);
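
A standalone sketch (hypothetical `Engine`/`Stats` types, not Paddle code) of the binding pattern used for dry_run above: release the GIL while the long-running C++ call executes, then hand a plain value object back to Python through read-only accessors.

#include <pybind11/pybind11.h>

namespace py = pybind11;

struct Stats {
  double total_time_ms{0.};
};

struct Engine {
  Stats DryRun() { return Stats{}; }
};

PYBIND11_MODULE(example, m) {
  py::class_<Stats>(m, "Stats")
      .def(py::init<>())
      .def("total_time", [](Stats &self) { return self.total_time_ms; });

  py::class_<Engine>(m, "Engine")
      .def(py::init<>())
      .def("dry_run", [](Engine &self) {
        Stats out;
        {
          // No Python API may be touched while the GIL is released.
          py::gil_scoped_release release;
          out = self.DryRun();
        }
        return out;  // returned by value, copied into a Python Stats object
      });
}
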