Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[new-exec] enable check_nan_inf #36802

Merged
merged 7 commits into from
Oct 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/fluid/framework/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)

cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto)
cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto scope)
if (WITH_GPU)
target_link_libraries(var_type_traits dynload_cuda)
endif()
Expand Down
6 changes: 3 additions & 3 deletions paddle/fluid/framework/details/nan_inf_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace framework {
namespace details {
// assert false when meets NAN or inf
void CheckVarHasNanOrInf(const std::string& op_type,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const std::string& var_name,
const platform::Place& place);

Expand All @@ -37,7 +37,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
const platform::Place& place);

void CheckOpHasNanOrInf(const framework::OperatorBase& op,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const platform::Place& place);

template <typename VarType>
Expand All @@ -55,7 +55,7 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,

#ifdef PADDLE_WITH_ASCEND_CL
void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const platform::Place& place);
#endif

Expand Down
15 changes: 7 additions & 8 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cc
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
}

void CheckVarHasNanOrInf(const std::string& op_type,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const std::string& var_name,
const platform::Place& place) {
auto* var = scope.FindVar(var_name);
Expand Down Expand Up @@ -440,7 +440,7 @@ static framework::Tensor& npu_float_status() {
}

void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const platform::Place& place) {
if (!platform::is_npu_place(place)) return;

Expand Down Expand Up @@ -505,7 +505,7 @@ void PrintNpuVarInfo(const std::string& op_type, const std::string& var_name,
}

void PrintNPUOpValueInfo(const framework::OperatorBase& op,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const platform::Place& place) {
LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
<< "), here we print some tensor value info of this op.";
Expand All @@ -523,7 +523,7 @@ void PrintNPUOpValueInfo(const framework::OperatorBase& op,
}

static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const platform::Place& place) {
if (!platform::is_npu_place(place)) return;

Expand Down Expand Up @@ -551,14 +551,13 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,

if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);

PADDLE_ENFORCE_LT(
sum, 1.0, platform::errors::PreconditionNotMet(
"Operator %s contains Nan/Inf.", op.DebugStringEx(&scope)));
PADDLE_ENFORCE_LT(sum, 1.0, platform::errors::PreconditionNotMet(
"Operator %s contains Nan/Inf.", op.Type()));
}
#endif

void CheckOpHasNanOrInf(const framework::OperatorBase& op,
const framework::Scope& exec_scope,
const framework::ScopeBase& exec_scope,
const platform::Place& place) {
std::call_once(white_list_init_flag, InitWhiteListFormEnv);

Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/framework/new_executor/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor)
graph_to_program_pass variable_helper timer monitor nan_inf_utils)

cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce)
cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS})
Expand Down
18 changes: 14 additions & 4 deletions paddle/fluid/framework/new_executor/interpretercore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@

#include <unordered_set>

#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include "paddle/fluid/platform/profiler.h"

PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true,
"Use inplace in new executor");

DECLARE_bool(check_nan_inf);

constexpr const char* kExceptionCaught = "ExceptionCaught";

namespace paddle {
Expand Down Expand Up @@ -80,7 +83,6 @@ paddle::framework::FetchList InterpreterCore::Run(
auto FeedInput = [&] {
for (size_t i = 0; i < feed_names_.size(); ++i) {
auto* feed_var = global_scope_->Var(feed_names_[i]);

auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
feed_tensor->ShareDataWith(feed_tensors[i]);
}
Expand Down Expand Up @@ -246,10 +248,10 @@ void InterpreterCore::BuildInplace() {
auto outvar = global_scope_->Var(iterout->second[0]);
if (invar && outvar) {
instr.AddInplace(invar, outvar);
VLOG(3) << "inplace " << op_base->Type() << " "
<< global_scope_->VarDesc(iter->second[0])->Name()
VLOG(3) << "inplace " << vec_instruction_[i].OpBase()->Type()
<< " " << global_scope_->GetNameById(iter->second[0])
<< " -> "
<< global_scope_->VarDesc(iterout->second[0])->Name()
<< global_scope_->GetNameById(iterout->second[0])
<< std::endl;
}
}
Expand Down Expand Up @@ -330,6 +332,14 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
platform::RecordEvent compute_event("Compute");
instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
}

// for debug nan/inf
if (FLAGS_check_nan_inf) {
VLOG(4) << "Check nan/inf";
framework::details::CheckOpHasNanOrInf(
*instr_node.OpBase(), *global_scope_,
instr_node.DeviceContext().GetPlace());
}
}

void InterpreterCore::ExecuteInstructionList(
Expand Down
69 changes: 49 additions & 20 deletions paddle/fluid/framework/new_executor/new_executor_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -471,44 +471,73 @@ struct VariableMetaInfo {
paddle::framework::VarDesc* vardesc_;
};

// TODO(Aurelius84): Consider inherit ScopeBase to unify interface.
class VariableScope {
// TODO(zhiqiu): Maybe we need to add rwlock for VariableScope?
class VariableScope : public ScopeBase {
public:
Variable* FindVar(const std::string& name) const {
if (!HasVar(name)) {
return nullptr;
auto it = name2id_.find(name);
if (it != name2id_.end()) {
PADDLE_ENFORCE_LT(it->second, var_list_.size(),
platform::errors::NotFound(
"The id(%d) of variable(%s) should not be larger "
"than the size of variable list(%d).",
it->second, name, var_list_.size()));
return var_list_[it->second];
}
auto var_id = VarId(name);
CheckExist(var_id);
return var_list[var_id];
return nullptr;
}

// Get variable id by name, return -1 if not found
int GetIdByName(const std::string& name) const {
  const auto iter = name2id_.find(name);
  return iter == name2id_.end() ? -1 : iter->second;
}

// Get variable name by id, return "" if not found
std::string GetNameById(int id) const {
  // NOTE(zhiqiu): do not read the name from vec_meta_info_[id].vardesc_:
  // that pointer may be null when the variable does not exist in the
  // original program desc but was created by the interpreter core itself
  // (e.g. by a d2h_copy or h2d_copy operator). A reverse lookup over
  // name2id_ is always safe.
  for (const auto& entry : name2id_) {
    if (entry.second == id) {
      return entry.first;
    }
  }
  return "";
}

bool HasVar(const std::string& name) const {
return name2id.find(name) != name2id.end();
return name2id_.find(name) != name2id_.end();
}

int VarId(const std::string& name) const {
CheckExist(name);
return name2id.at(name);
return name2id_.at(name);
}

Variable* Var(int id) const { return var_list.at(id); }
Variable* Var(int id) const { return var_list_.at(id); }

Variable* Var(const std::string& name) const {
return var_list.at(VarId(name));
return var_list_.at(VarId(name));
}

size_t VarSize() const { return var_list.size(); }
size_t VarSize() const { return var_list_.size(); }

void AddVar(const std::string& name, VarDesc* var_desc) { // NOLINT
name2id[name] = VarSize();
name2id_[name] = VarSize();
auto v = new Variable();
if (nullptr == var_desc) {
v->GetMutable<LoDTensor>();
} else {
InitializeVariable(v, var_desc->GetType());
}
var_list.push_back(v);
var_list_.push_back(v);

VariableMetaInfo info;
info.var_ref_count_ = 0;
Expand All @@ -517,8 +546,8 @@ class VariableScope {
}

void AddVar(const std::string& name, Variable& var) { // NOLINT
name2id[name] = VarSize();
var_list.push_back(&var);
name2id_[name] = VarSize();
var_list_.push_back(&var);

VariableMetaInfo info;
info.var_ref_count_ = 0;
Expand All @@ -540,10 +569,10 @@ class VariableScope {
}

void CheckExist(int id) const {
PADDLE_ENFORCE_LT(id, var_list.size(),
PADDLE_ENFORCE_LT(id, var_list_.size(),
platform::errors::PreconditionNotMet(
"Required var_id < %d, but received var_id = %d.",
var_list.size(), id));
var_list_.size(), id));
}

void CheckExist(const std::string& name) const {
Expand All @@ -553,8 +582,8 @@ class VariableScope {
}

private:
std::vector<Variable*> var_list;
std::map<std::string, int> name2id;
std::vector<Variable*> var_list_;
std::map<std::string, int> name2id_;
std::vector<VariableMetaInfo> vec_meta_info_;
};

Expand Down
12 changes: 11 additions & 1 deletion paddle/fluid/framework/scope.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@ class Variable;
namespace paddle {
namespace framework {

// TODO(zhiqiu): add more function in base class
// Minimal abstract interface shared by Scope and VariableScope, so utilities
// such as the nan/inf checker can accept either kind of scope.
class ScopeBase {
 public:
  /// Find a variable in the scope or any of its ancestors. Returns
  /// nullptr if cannot find.
  /// Caller doesn't own the returned Variable.
  virtual Variable* FindVar(const std::string& name) const = 0;
  // Virtual destructor so deleting through a ScopeBase* is well-defined;
  // `= default` is the idiomatic spelling for an empty destructor.
  virtual ~ScopeBase() = default;
};

class Scope;

/**
Expand All @@ -49,7 +59,7 @@ class Scope;
* One net can run in different scopes and update different variable in the
* scope.
*/
class Scope {
class Scope : public ScopeBase {
public:
Scope() {}
~Scope();
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/framework/var_type_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class BKCLCommunicator;

namespace framework {
class LoDRankTable;
class ScopeBase;
class LoDTensor;
class ReaderHolder;
class Scope;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,12 @@ def build_program(self):
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(main_program, startup_program):
w = paddle.rand([10, 20])
w = paddle.rand([10, 3])
ids = paddle.static.data(name="id", shape=[5], dtype='int64')
data = paddle.static.data(name="data", shape=[3], dtype='float32')
emb = paddle.nn.functional.embedding(
x=ids, weight=w, sparse=False, name="embedding")
emb = emb + data

return main_program, startup_program, emb

Expand All @@ -273,7 +275,7 @@ def _run(self, feeds):

for feed in feeds:
out = exe.run(main_program, feed=feed, fetch_list=fetch_vars)

print(out)
return out

def run_new_executor(self, feed):
Expand All @@ -284,12 +286,27 @@ def run_new_executor(self, feed):

def test_exception(self):
feed = [{
'id': np.array([1, 2, 3, 4, 5]).astype(np.int64)
'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
'data': np.array([1, 2, 3, 4]).astype(np.float32),
}, {
'id': np.array([1, 2, 3, 4, 11]).astype(np.int64)
'id': np.array([1, 2, 3, 4, 11]).astype(np.int64),
'data': np.array([1, 2, 3, 4]).astype(np.float32),
}]
self.assertRaises(ValueError, self.run_new_executor, feed)

def test_nan(self):
    # Enable the runtime nan/inf checker so the new executor raises
    # RuntimeError when a tensor containing NaN flows through an op.
    paddle.fluid.set_flags({'FLAGS_check_nan_inf': True})
    feed = []
    for _ in range(2):
        feed.append({
            'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
            'data': np.array([1, 2, 3]).astype(np.float32),
        })
    # Poison the second batch with a NaN; the first batch must run cleanly.
    feed[1]['data'][0] = np.nan
    self.assertRaises(RuntimeError, self.run_new_executor, feed)


if __name__ == "__main__":
unittest.main()