Refactor and simplify hook design & add Tensor.register_hook API #31775
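At a high level, two kinds of hooks appear in the diff below: ordinary gradient hooks, which receive a `VariableWrapper` and return the (possibly replaced) one to use from then on (`tmp_var = (*hook_pair.second)(tmp_var)`), presumably what `Tensor.register_hook` registers; and reduce hooks, which run on the leaf `var_` in place after accumulation (`(*hook)(var_)`). The following is only a sketch of those two call conventions with stand-in types, not Paddle's actual classes; the names `VariableWrapperHook` and `InplaceVariableWrapperHook` are taken from the comments in the diff, everything else is assumed.

```cpp
// Sketch only: the two hook call conventions visible in this diff.
// VariableWrapper here is a stand-in, not paddle::imperative::VariableWrapper.
#include <iostream>
#include <memory>

struct VariableWrapper {
  float grad = 0.f;
};

// Ordinary gradient hook: takes the grad var and returns the one to use
// afterwards, so it may replace it without touching the original.
struct VariableWrapperHook {
  virtual ~VariableWrapperHook() = default;
  virtual std::shared_ptr<VariableWrapper> operator()(
      const std::shared_ptr<VariableWrapper>& var) = 0;
};

// Reduce hook (not user-registrable): works on the leaf var_ in place after
// accumulation, e.g. for the parallel reduce/allreduce strategy.
struct InplaceVariableWrapperHook {
  virtual ~InplaceVariableWrapperHook() = default;
  virtual void operator()(VariableWrapper* var) = 0;
};

// Example of the first kind: scale the gradient and return a fresh wrapper.
struct ScaleHook : VariableWrapperHook {
  std::shared_ptr<VariableWrapper> operator()(
      const std::shared_ptr<VariableWrapper>& var) override {
    auto out = std::make_shared<VariableWrapper>(*var);
    out->grad *= 2.f;  // the input wrapper is left untouched
    return out;
  }
};

int main() {
  auto g = std::make_shared<VariableWrapper>();
  g->grad = 1.f;
  ScaleHook hook;
  auto hooked = hook(g);  // same call shape as (*hook_pair.second)(tmp_var)
  std::cout << g->grad << " -> " << hooked->grad << "\n";  // prints 1 -> 2
  return 0;
}
```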
@@ -141,17 +141,6 @@ void BasicEngine::PrepareGradAccumulators(
             << var.get()
             << ") that don't have grad node with reference count "
             << accumulator->RefCnt();
-
-      if (var->HasLeafHooks()) {
-        VLOG(3) << "Grad variable wrapper (" << var->Name()
-                << ") has leaf grad hooks.";
-        PADDLE_ENFORCE_NE(
-            var->HasGradNode(), true,
-            platform::errors::PermissionDenied(
-                "Only leaf Tensor's gradient can append hook to "
-                "Gradientaccumulator."));
-        accumulator->SetPostHooks(var->GetLeafHooks());
-      }
     } else {
       // Because Inplace op overwrites the grad_node of the input grad_var. So
       // only the information of grad_pending_node can be used to find the
@@ -292,10 +281,25 @@ void BasicEngine::Execute() {
     auto& bwd_ins = cur_op.GetInsMap();
     auto& bwd_outs = cur_op.GetOutsMap();

+    /**
+     * [ Why need temporary inputs and outputs here? ]
+     *
+     * 1. For inputs
+     *    - Hook execution should not change original input tensor.
+     *      User can register hook for Tensor's gradient, It is expected
+     *      that the hook only affects the gradient of the backward
+     *      propagation, and does not affect the gradient value input
+     *      as the hook.
+     *
+     * 2. For outputs
+     *
+     *    - construct the temp output map, avoid to disrupt graph
+     *    - replace the element in the map by temp var, because a
+     *      var may be coresponding to several grad var in one op
+     */
+    NameVarMap<VariableWrapper> tmp_ins(bwd_ins);
     NameVarMap<VariableWrapper> tmp_outs(bwd_outs);
-    // 1. construct the temp output map, avoid to disrupt graph
-    // 2. replace the element in the map by temp var, because a
-    //    var may be coresponding to several grad var in one op

     for (auto& pair : tmp_outs) {
       if (!pair.second.IsGrad()) {
         continue;
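The comment block above leans on the fact that `NameVarMap<VariableWrapper>` holds `shared_ptr`s: copying the map duplicates the pointers, not the variables, so when the hook loop later rebinds an element of `tmp_ins` to a new wrapper, `bwd_ins` (and hence the gradient the user observes) keeps pointing at the original. A small self-contained illustration of that rebinding semantics, with simplified stand-in types rather than Paddle's real `NameVarMap`/`VariableWrapper`:

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Simplified stand-ins for Paddle's VariableWrapper / NameVarMap.
struct Var { float grad = 1.0f; };
using VarList = std::vector<std::shared_ptr<Var>>;
using NameVarMap = std::map<std::string, VarList>;

int main() {
  NameVarMap bwd_ins{{"X", {std::make_shared<Var>()}}};
  NameVarMap tmp_ins(bwd_ins);  // copies the shared_ptrs, not the Vars

  // A "hook" produces a new Var instead of mutating the old one.
  auto hooked = std::make_shared<Var>();
  hooked->grad = bwd_ins["X"][0]->grad * 2.0f;
  tmp_ins["X"][0] = hooked;  // only the temporary map is rebound

  assert(bwd_ins["X"][0]->grad == 1.0f);  // original input unchanged
  assert(tmp_ins["X"][0]->grad == 2.0f);  // hooked value fed to the grad op
  return 0;
}
```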
@@ -366,7 +370,7 @@ void BasicEngine::Execute() {
           // If a tmp var has been created, there is no need to create it
           // again.
           for (auto& in_var :
-               bwd_ins.at(inplace_grad_name_map.at(pair.first))) {
+               tmp_ins.at(inplace_grad_name_map.at(pair.first))) {
             if (in_var == var) {
               auto tmp_var = std::make_shared<VariableWrapper>(var->Name());
               tmp_var->SetType(var->Type());
@@ -385,7 +389,7 @@ void BasicEngine::Execute() {

     VLOG(4) << "Check whether there is any inplace operation affecting "
                "gradient calculation.";
-    for (auto& pair : bwd_ins) {
+    for (auto& pair : tmp_ins) {
       for (auto& var_wrapper : pair.second) {
         auto wrapper_version_snapshot = var_wrapper->InplaceVersionSnapshot();
         auto tensor_version =
@@ -408,9 +412,25 @@ void BasicEngine::Execute() {
       }
     }

+    for (auto& pair : tmp_ins) {
+      for (size_t i = 0; i < pair.second.size(); ++i) {
+        auto& var = pair.second[i];
+        if (var->HasHook()) {
+          VLOG(3) << "Call " << var->GetHooks().size() << " hooks of "
+                  << cur_op.Type() << "'s input `" << pair.first
+                  << "`'s var `" << var->Name() << "`.";
+          auto tmp_var = var;
+          for (const auto& hook_pair : var->GetHooks()) {
+            tmp_var = (*hook_pair.second)(tmp_var);
+          }
+          tmp_ins[pair.first][i] = tmp_var;
+        }
+      }
+    }
+
     {
       VLOG(3) << "Start to execute grad op " << cur_op.Type();
-      OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(),
+      OpBase::Run(cur_op.InnerOp(), tmp_ins, tmp_outs, cur_op.Attrs(),
                   cur_op.place());
     }
@@ -428,15 +448,14 @@ void BasicEngine::Execute() {
       if (!accumulator->SumGradCompleted()) {
         continue;
       }
-      // 1. Call Hooks for **inner_var_**
+      // 1. Call Hooks for `inner_var_`
+      accumulator->CallHooks();

-      // 2. Sum Gradient with Previous Graph
+      // 2. Sum Gradient `inner_var_` to `var_` of Current or Previous Graph
       accumulator->AccumulateGrad();

-      // 3. Call backward Hooks for **var_**
-      if (accumulator->HasPostHooks()) {
-        accumulator->CallBackwardPostHooks();
-      }
+      // 3. Call backward Hooks for `var_`
+      accumulator->CallReduceHooks();

Review comment: Bad name; or perhaps use inheritance to fix it? CallHooks suggests invoking all hooks, so having CallReduceHooks next to it is confusing to me.
Reply: Done.

     }

     need_accu_var_list_.clear();
@@ -385,8 +385,8 @@ static platform::Place GetPlaceOfVar(

 void GradientAccumulator::AccumulateGrad() {
   /**
-   * If the gradient has been calculated by previous graph,
-   * it should be added to the previous graph result.
+   * If the leaf gradient has been calculated done, the inner_var_
+   * should be added to the var_.
    */
   if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) {

Review comment: The !HasInnerVar() check should be removable now.
Reply: I don't think so; every call to AccumulateGrad still requires an inner var.
Reply: Right, agreed.

     return;
@@ -397,7 +397,7 @@ void GradientAccumulator::AccumulateGrad() {
                         "this auto-grad"));
   PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true,
                     platform::errors::InvalidArgument(
-                        "Interior var of Leaf tensor should be initialized."));
+                        "Interior var of Leaf tensor should be initialized."));
   auto* src = inner_var_->MutableVar();
   auto* dst = var_->MutableVar();
   if (!var_->IsEmpty()) {
@@ -428,10 +428,46 @@ void GradientAccumulator::AccumulateGrad() {
     *(dst) = std::move(*src);
     var_->SetType(inner_var_->Type());
     var_->SetDataType(inner_var_->DataType());
+    var_->SetIsEmpty(false);
   }
   inner_var_.reset();
 }

+void GradientAccumulator::CallHooks() {
+  if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) {
+    return;
+  }
+  PADDLE_ENFORCE_EQ(
+      HasInnerVar(), true,
+      platform::errors::InvalidArgument(
+          "Leaf Tensor's inner var is nullptr when call gradient hook."));
+  PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true,
+                    platform::errors::InvalidArgument("Leaf Tensor's inner var "
+                                                      "is not initialized when "
+                                                      "call gradient hook."));
+  if (var_->HasHook()) {

Review comment: Seal this, or make it different from the same code in …
Reply: Only the for loop is similar.

+    VLOG(3) << "Call " << var_->GetHooks().size()
+            << " hooks of leaf gradient accumulator's inner var `"
+            << var_->Name() << "`.";
+    auto tmp_var = inner_var_;
+    VLOG(3) << "Input var " << var_->Name() << "'s hook size - "
+            << var_->GetHooks().size();
+    for (const auto& hook_pair : var_->GetHooks()) {
+      tmp_var = (*hook_pair.second)(tmp_var);
+    }
+    inner_var_ = tmp_var;

Review comment: For a leaf node, running CallGradientHooks inside the GradientAccumulator replaces its own inner_var_, so it is effectively in-place, right?
Reply: Yes, it was in-place before as well. The point of changing it this way is to unify hook management and invocation under a single base class; if an InplaceHook were used here, the old HookPipeLine machinery would still be needed and both the data structures and the logic would be more complex.
Reply: OK.

+  }
+}

+void GradientAccumulator::CallReduceHooks() {

Review comment: Add some checks to distinguish it from the normal hooks.
Reply: Done.

+  if (var_->HasReduceHook()) {
+    for (const auto& hook : var_->GetReduceHooks()) {
+      VLOG(3) << "call gradient accumulator backward hooks.";
+      (*hook)(var_);
+    }
+  }
+}

 void EagerGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var,
                                        size_t trace_id, bool unchange_input) {
   /**
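The `AccumulateGrad()` shown in this file follows a "take over on first use, add afterwards" pattern: if `var_` has never held a gradient it simply takes the contents of `inner_var_` (a move in the real code), otherwise the two are summed, and `inner_var_` is reset either way. A deliberately simplified model of that control flow, with plain floats standing in for tensors and `VariableWrapper`:

```cpp
#include <cassert>
#include <optional>

// Simplified model of the accumulate-or-take-over logic in AccumulateGrad():
// var_ is the persistent leaf grad, inner_var_ the result of the current graph.
struct LeafGrad {
  std::optional<float> var_;        // empty until a first backward completes
  std::optional<float> inner_var_;  // gradient summed within the current graph

  void AccumulateGrad() {
    if (!inner_var_) return;        // nothing was summed in this graph
    if (var_) {
      *var_ += *inner_var_;         // accumulate across graphs / batches
    } else {
      var_ = *inner_var_;           // first result: take it (a move in Paddle)
    }
    inner_var_.reset();             // inner var is consumed either way
  }
};

int main() {
  LeafGrad g;
  g.inner_var_ = 1.5f;
  g.AccumulateGrad();
  assert(*g.var_ == 1.5f && !g.inner_var_);
  g.inner_var_ = 2.0f;
  g.AccumulateGrad();
  assert(*g.var_ == 3.5f);          // accumulated across two backward runs
  return 0;
}
```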
@@ -40,8 +40,8 @@ class GradientAccumulator {
     }

     // inner_var_ record the grad of this auto-grad.
-    // Only need to generate inner var for non-empty leaf-tensor.
-    if (var->IsLeafGrad() && !var->IsEmpty()) {
+    // Only need to generate inner var for leaf-tensor.
+    if (var->IsLeafGrad()) {
       inner_var_ = std::make_shared<VariableWrapper>(var->Name());
       inner_var_->SetType(var->Type());
       inner_var_->SetDataType(var->DataType());
@@ -52,9 +52,6 @@ class GradientAccumulator {
               << ") to store result of this Graph";
     }

-    // TODO(zhouwei): fix Tensor.clear_gradient() bug, remove this hard flag
-    var->SetIsEmpty(false);
-
     // var_ is the final grad, processed by hooks and grad accumulation
     var_ = var;
   }

@@ -93,42 +90,38 @@ class GradientAccumulator {

   inline bool HasInnerVar() const { return inner_var_ != nullptr; }

-  /* Hook related methods */
-  inline bool HasPostHooks() const { return !post_hooks_.expired(); }
-
-  void SetPostHooks(const std::shared_ptr<LeafVarHookPipeline>& hooks) {
-    PADDLE_ENFORCE_NOT_NULL(
-        hooks, platform::errors::InvalidArgument(
-                   "The hook set to GradientAccumulator is nullptr."));
-
-    auto shared_hooks = post_hooks_.lock();
-    if (shared_hooks != hooks) {
-      PADDLE_ENFORCE_EQ(
-          shared_hooks, nullptr,
-          platform::errors::PermissionDenied(
-              "Cannot set post hooks twice to GradientAccumulator."));
-      post_hooks_ = hooks;
-    }
-  }
-  // void CallHooks(){}
-  // ** inner_var_ **

   // function that Sum Gradient with Previous Graph
   void AccumulateGrad();

-  // call backward post hooks, such as reduce hook
-  void CallBackwardPostHooks() {
-    PADDLE_ENFORCE_NE(
-        post_hooks_.expired(), true,
-        platform::errors::NotFound(
-            "The post hooks of GradientAccumulator for Tensor `%s` expired.",
-            var_->Name()));
-    auto shared_hooks = post_hooks_.lock();
-    for (const auto& hook : shared_hooks->backward_hooks()) {
-      VLOG(3) << "call gradient accumulator backward hooks.";
-      (*hook)(var_);
-    }
-  }
+  /** [ Hook related methods ]
+   *
+   * [Why need two types of VariableWrapperHook? ]
+   *
+   * There are two types of gradient accumulation:
+   * 1. Gradient accumulation in same batch
+   * 2. Gradient accumulation across batchs
+   * The order of execution between Hooks and gradient accumulation:
+   *
+   *   [ Gradient accumulation in same batch]
+   *                     |
+   *        [ leaf GradVarBase hooks ]
+   *                     |
+   *   [ Gradient accumulation across batchs ]
+   *                     |
+   *     [ Gradient reduce / allreduce hooks ]
+   *
+   * Because we currently intend to accumulate these two gradient
+   * accumulation in one GradientAccumulator, We must distinguish between
+   * two types of hooks.
+   *
+   * And the InplaceVariableWrapperHook does not allow users to register
+   * directly, and is currently only used to support the reduce strategy of
+   * parallel multi-card training.
+   */
+  void CallHooks();

Review comment: These two functions are not a parallel structure, so don't give them such closely related names.
Reply: Done, thanks.

+  void CallReduceHooks();

  protected:
   VariableWrapper* var_;

@@ -137,7 +130,6 @@ class GradientAccumulator {
   std::shared_ptr<VariableWrapper> inner_var_;
   size_t ref_cnt_{0};
   size_t cur_cnt_{0};
-  std::weak_ptr<LeafVarHookPipeline> post_hooks_;
 };

 class EagerGradientAccumulator : public GradientAccumulator {
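To make the ordering in the header comment above concrete: for each leaf accumulator, the engine (see the `BasicEngine::Execute` hunks earlier in this diff) first finishes in-batch summation into `inner_var_`, then runs the user hooks on `inner_var_`, then folds the result into `var_` (accumulation across batches), and only then fires the reduce/allreduce hooks on `var_`. A toy driver spelling out that sequence with stand-in functions; the real calls are `CallHooks()`, `AccumulateGrad()` and `CallReduceHooks()`:

```cpp
// Sketch of the per-leaf ordering described above, with stand-in functions.
#include <iostream>

void SumGradOfCurrentGraph() { std::cout << "sum grads into inner_var_\n"; }
void CallHooks()            { std::cout << "run leaf gradient hooks on inner_var_\n"; }
void AccumulateGrad()       { std::cout << "add inner_var_ into var_ (cross-batch)\n"; }
void CallReduceHooks()      { std::cout << "run reduce/allreduce hooks on var_\n"; }

int main() {
  SumGradOfCurrentGraph();  // 0. in-batch accumulation (SumGrad)
  CallHooks();              // 1. hooks registered via Tensor.register_hook
  AccumulateGrad();         // 2. gradient accumulation across batches
  CallReduceHooks();        // 3. reduce hooks used by parallel training
  return 0;
}
```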
Review comment: How about creating tmp_ins only when it is needed? It seems to make too many temporary VariableWrapper copies here.
Reply: Done.