[cherry-pick] cinn_launch_op: optimize the overhead of preparing variables (#41777) #41910

Merged 1 commit on Apr 19, 2022
34 changes: 34 additions & 0 deletions paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
@@ -31,11 +31,13 @@
#include "cinn/hlir/framework/graph_compiler.h"
#include "cinn/hlir/framework/pass.h"
#include "cinn/hlir/pass/use_pass.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h"
#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor.h"
@@ -45,6 +47,7 @@
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/core/utils/rw_lock.h"

DECLARE_bool(enable_pe_launch_cinn);
namespace paddle {
namespace framework {
namespace paddle2cinn {
@@ -217,6 +220,33 @@ void CinnCompiler::Clear() {
real_compiled_num_.store(0);
}

void CinnCompiler::CheckCompiledValid(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const CinnCompiledObject& compiled_obj) const {
const auto& input_var_names = graph.Get<std::vector<std::string>>(kInputVars);
const auto& output_var_names =
graph.Get<std::vector<std::string>>(kOutputVars);
auto* launch_context = compiled_obj.launch_context.get();
// 1. check that all of the output variables will be assigned by the compiled program
for (auto&& var_name : output_var_names) {
PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
platform::errors::PreconditionNotMet(
"Variable(%s) not applied in CINN", var_name));
}
// 2. check that all of the used input variables were correctly deduced by CINN.
for (const auto& var_name : input_var_names) {
// some input variables are not used by CINN because they were eliminated
// by its optimization passes or because some of its operators need fewer inputs
if (!launch_context->IsVariableUsed(var_name)) {
VLOG(4) << "Input variable" << var_name << " not used by cinn";
continue;
}
launch_context->CheckTensorEquivalent(var_name,
*input_tensors.at(var_name));
}
}

std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
@@ -244,6 +274,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
std::make_unique<GraphCompiler>(target, scope, cinn_graph);
GraphCompiler::CompileOptions options;
options.with_instantiate_variables = false;
if (!FLAGS_enable_pe_launch_cinn) {
options.with_buffer_handle_instruction_inserted = true;
}
auto compiled_res =
graph_compiler->Build(options, std::move(fetch_ids), stream);
auto compiled_obj = std::make_unique<CinnCompiledObject>();
@@ -254,6 +287,7 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
compiled_obj->launch_context =
std::make_unique<operators::details::CinnLaunchContext>(graph,
*compiled_obj);
CheckCompiledValid(graph, input_tensors, *compiled_obj);
return compiled_obj;
}

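Moving the consistency check into CinnCompiler::CompileGraph means it now runs once per compiled graph (compilation results are cached, see cache_by_address_ in the header below) rather than on every kernel launch, which is where the overhead saving comes from. A minimal, framework-free sketch of the same two-phase validation pattern — every name here is a hypothetical stand-in, not Paddle's API:

#include <map>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

// Stand-in for the launch context's bookkeeping of which variables the
// compiled program actually reads or writes.
struct StubLaunchContext {
  std::set<std::string> used_vars;
  bool IsVariableUsed(const std::string& name) const {
    return used_vars.count(name) > 0;
  }
};

// Phase 1: every declared output must be produced by the compiled program.
// Phase 2: every input the program still uses must have a feed tensor
// (the real code additionally compares tensor shape and dtype).
void CheckCompiledValid(const std::vector<std::string>& input_names,
                        const std::vector<std::string>& output_names,
                        const std::map<std::string, int>& input_tensors,
                        const StubLaunchContext& ctx) {
  for (const auto& name : output_names) {
    if (!ctx.IsVariableUsed(name)) {
      throw std::runtime_error("Variable(" + name + ") not applied in CINN");
    }
  }
  for (const auto& name : input_names) {
    if (!ctx.IsVariableUsed(name)) continue;  // eliminated by optimization
    if (input_tensors.count(name) == 0) {
      throw std::runtime_error("input " + name + " has no feed tensor");
    }
  }
}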
7 changes: 7 additions & 0 deletions paddle/fluid/framework/paddle2cinn/cinn_compiler.h
@@ -103,6 +103,13 @@ class CinnCompiler {
const ::cinn::common::Target& target, std::int64_t compiled_num,
void* stream = nullptr) const;

// check whether a compiled result is valid by verifying the consistency
// of the external (input and output) variables of the subgraph
void CheckCompiledValid(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const CinnCompiledObject& compiled_obj) const;

std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
std::unordered_map<CinnCacheKeyByAddress, std::int64_t, CinnCacheKey::Hash>
cache_by_address_;
2 changes: 1 addition & 1 deletion paddle/fluid/operators/cinn/CMakeLists.txt
@@ -3,7 +3,7 @@ include(operators)
cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context)
cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn)

SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
register_operators(DEPS ${CINN_OP_DEPS})

if (WITH_TESTING)
25 changes: 16 additions & 9 deletions paddle/fluid/operators/cinn/cinn_launch_context.cc
@@ -33,6 +33,7 @@
#include "paddle/fluid/framework/paddle2cinn/transform_type.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/cinn/cinn_op_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
@@ -69,13 +70,6 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
graph.Get<std::vector<std::string>>(framework::paddle2cinn::kOutputVars);
internal_var_names_ =
ExtractInternalVarNames(input_var_names, output_var_names);
// check completeness of output variables in compiled result
for (auto&& var_name : output_var_names) {
PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true,
platform::errors::PreconditionNotMet(
"Variable(%s) not applied in CINN", var_name));
}

// initialize all execution arguments
InitializeArguments();
// DEPRECATED(CtfGo): following callback assignment will be deprecated soon
@@ -235,7 +229,7 @@ void CinnLaunchContext::InitializeArguments() {
cinn_tensor->shape().data().size());
cinn_buffer->type = cinn::runtime::ToRuntimeType(cinn_tensor->type());
VLOG(4) << string::Sprintf(
"Append an argument:name(%s),dims(%s),type(%s)",
"Append an argument:name(%s),dims(%s),type(%s)", arg,
framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(),
cinn_tensor->type());
name2argument_.emplace(arg, cinn_buffer.get());
@@ -400,7 +394,20 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place,
std::unordered_map<Scope*, Scope*> scope_map = {
{parallel_executor_->GetLocalScopes().front(), scope}};
parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map);
parallel_executor_->PrepareVariables(scope);
// instead of using the PrepareVariables function of ParallelExecutor to
// initialize all variables, here we only initialize internal variables
// because external variables are already present in the parent scope.
for (auto&& var_name : internal_var_names_) {
auto* var = scope->FindVar(var_name);
if (var != nullptr) {
VLOG(5) << "internal variable:" << var_name
<< " has been initialized beforehand in global scope, skipped.";
continue;
}
framework::InitializeVariable(scope->Var(var_name),
framework::proto::VarType::LOD_TENSOR);
}

for (auto&& var_name : initialized_beforehand_vars_) {
auto* var = scope->GetVar(var_name);
auto* buffer = GetCinnBufferOfVar(var_name);
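The key change in this hunk replaces parallel_executor_->PrepareVariables(scope), which touched every variable, with a loop over internal_var_names_ only, since external variables already live in the parent scope. A self-contained sketch of that "create only what is missing" pattern, using stand-in types rather than Paddle's real Scope API:

#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for framework::Scope: FindVar returns nullptr when the variable
// is absent, Var creates it on demand.
struct StubScope {
  std::unordered_map<std::string, int> vars;
  int* FindVar(const std::string& name) {
    auto it = vars.find(name);
    return it == vars.end() ? nullptr : &it->second;
  }
  int* Var(const std::string& name) { return &vars[name]; }
};

void InitInternalVars(StubScope* scope,
                      const std::vector<std::string>& internal_var_names) {
  for (const auto& name : internal_var_names) {
    if (scope->FindVar(name) != nullptr) {
      continue;  // initialized beforehand, mirroring the VLOG(5) branch above
    }
    // the real code calls framework::InitializeVariable(..., LOD_TENSOR)
    *scope->Var(name) = 0;
  }
}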
40 changes: 16 additions & 24 deletions paddle/fluid/operators/cinn/cinn_launch_op.h
@@ -18,14 +18,17 @@
#include <string>
#include <unordered_map>
#include <unordered_set>

#include "cinn/common/target.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
#include "paddle/fluid/operators/cinn/cinn_launch_context.h"
#include "paddle/fluid/operators/cinn/cinn_op_helper.h"

DECLARE_bool(enable_pe_launch_cinn);
namespace paddle {
namespace operators {

@@ -101,34 +104,23 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile(
compilation_key, inputs_name2tensor, target, stream);
details::DebugCinnCompiledResult(cinn_compiled_object);

auto* launch_context = cinn_compiled_object.launch_context.get();
// Step 3. check the computational consistency of the subgraph
// before and after the compilation
// 3.1 Input variables: tensors of input variables have
// been initialized before the graph is compiled, so just check the
// equality between the tensors of paddle and cinn.
for (const auto& var_name : input_x_variable_names) {
// some input variables are not needed by cinn because they are
// eliminated by optimization passes or some cinn operators use
// fewer variables
if (!launch_context->IsVariableUsed(var_name)) {
VLOG(4) << "Input variable " << var_name << " not used by cinn";
continue;
}
launch_context->CheckTensorEquivalent(var_name,
*inputs_name2tensor.at(var_name));
}

// Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
// Step 3. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
details::SetCinnRuntimeFlags();

// Step 5. use PE to execute the compiled CINN instructions
// in nodes of the runtime graph
VLOG(4) << "Execute the runtime graph by PE";
framework::Scope& exec_scope = scope.NewScope();
auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
// Step 4. Execute the compiled CINN instructions by a PE or
// by the CINN compiled program in sequential order
if (FLAGS_enable_pe_launch_cinn) {
VLOG(4) << "Execute the runtime graph by PE";
framework::Scope& exec_scope = scope.NewScope();
auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
} else {
VLOG(4) << "Execute the compiled executable program";
launch_context->UpdateCapturedEnv(scope, place);
LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
}
VLOG(4) << "CinnLaunchOp launch execution done.";
}
};
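With the consistency check moved into the compiler, the kernel body shrinks to compile, set runtime flags, then dispatch on FLAGS_enable_pe_launch_cinn. The flag crosses translation units through gflags' DECLARE/DEFINE split: flags.cc defines it (last file in this diff) and each consumer pulls in an extern declaration. A minimal consumer-side sketch with stand-in execution functions (it links only together with a unit that defines the flag):

#include <iostream>

#include "gflags/gflags.h"

DECLARE_bool(enable_pe_launch_cinn);  // expands to an extern declaration

void RunWithParallelExecutor() { std::cout << "PE path\n"; }
void RunCompiledProgram() { std::cout << "sequential CINN path\n"; }

// Mirrors Step 4 of the kernel: one flag selects the execution engine.
void Launch() {
  if (FLAGS_enable_pe_launch_cinn) {
    RunWithParallelExecutor();  // instructions run as nodes of a PE graph
  } else {
    RunCompiledProgram();  // compiled program runs in sequential order
  }
}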
77 changes: 51 additions & 26 deletions paddle/fluid/operators/cinn/cinn_launch_op_test.cc
@@ -32,6 +32,7 @@ USE_OP(cinn_launch);
USE_OP(cinn_instruction_run);
USE_OP_ITSELF(elementwise_add);
DECLARE_double(eager_delete_tensor_gb);
DECLARE_bool(enable_pe_launch_cinn);

PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
#ifdef PADDLE_WITH_CUDA
@@ -42,43 +43,67 @@ namespace paddle::operators {

using framework::paddle2cinn::CinnCompiler;

TEST(CinnLaunchOpTest, TestWithElementwiseAdd) {
paddle::framework::InitDevices();
platform::SetNumThreads(1);
// cache test graph into CinnCompiler
const std::string& test_op_out_name = "cinn_launch_op_out";
const std::string& add_op_out_name = "add_op_out";
auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));

// create cinn_launch_op and elementwise_add op
auto cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
"cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
{{"compilation_key", compilation_key}});
auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
"elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
{{"Out", {add_op_out_name}}}, {{}});

// Run ops and check the computation results
auto run_and_check_fn = [&](const platform::Place& place) {
class TestCinnLaunchOp : public ::testing::Test {
public:
const char* test_op_out_name = "cinn_launch_op_out";
const char* add_op_out_name = "add_op_out";
std::unique_ptr<framework::OperatorBase> cinn_launch_op;
std::unique_ptr<framework::OperatorBase> elementwise_add_op;

void SetUp() override {
paddle::framework::InitDevices();
platform::SetNumThreads(1);
// cache test graph into CinnCompiler
auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));

// create cinn_launch_op and elementwise_add op
cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
"cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
{{"compilation_key", compilation_key}});
elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
"elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
{{"Out", {add_op_out_name}}}, {{}});
}

void RunAndCheck(const platform::Place& place) {
// Run ops and check the computation results
framework::Scope scope;
InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope);
scope.Var(test_op_out_name)->GetMutable<LoDTensor>();
scope.Var(add_op_out_name)->GetMutable<LoDTensor>();
cinn_launch_op->Run(scope, place);
elementwise_add_op->Run(scope, place);
cinn_launch_op->Run(scope, place);
CompareOpResult<float>(scope.GetVar(test_op_out_name),
scope.GetVar(add_op_out_name));
};
FLAGS_eager_delete_tensor_gb = -1;
}

void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
};

TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) {
// CPU
run_and_check_fn(platform::CPUPlace());
run_and_check_fn(platform::CPUPlace());
RunAndCheck(platform::CPUPlace());
// the second run on the same place is to check the cache logic
RunAndCheck(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
// GPU
RunAndCheck(platform::CUDAPlace());
RunAndCheck(platform::CUDAPlace());
#endif
}

TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) {
// set FLAGS_enable_pe_launch_cinn=false to switch to using the
// default scheduler of CINN to execute the compiled program
FLAGS_enable_pe_launch_cinn = false;

RunAndCheck(platform::CPUPlace());
RunAndCheck(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
// GPU
run_and_check_fn(platform::CUDAPlace());
run_and_check_fn(platform::CUDAPlace());
RunAndCheck(platform::CUDAPlace());
RunAndCheck(platform::CUDAPlace());
#endif
}

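One caveat in the fixture above: TestRunInstructionByCinnProgram sets FLAGS_enable_pe_launch_cinn = false and never restores it, so the two TEST_F cases are order-dependent inside one test binary. A small, self-contained sketch of saving and restoring a flag per test — the flag and fixture names here are hypothetical, not from the PR:

#include "gtest/gtest.h"

static bool FLAGS_demo_flag = true;  // stand-in for a real gflags symbol

class FlagGuardTest : public ::testing::Test {
 protected:
  bool saved_flag_ = FLAGS_demo_flag;  // captured when the fixture is built
  void TearDown() override { FLAGS_demo_flag = saved_flag_; }  // restore
};

TEST_F(FlagGuardTest, FlipsFlagLocally) {
  FLAGS_demo_flag = false;
  EXPECT_FALSE(FLAGS_demo_flag);
}

TEST_F(FlagGuardTest, SeesTheDefaultAgain) {
  EXPECT_TRUE(FLAGS_demo_flag);  // unaffected by the previous test
}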
26 changes: 26 additions & 0 deletions paddle/fluid/platform/flags.cc
@@ -751,6 +751,32 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "",
*/
PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "",
"It controls the cinn op subset to be not used.");

/*
* CINN related FLAG
* Name: FLAGS_enable_pe_launch_cinn
* Since Version: 2.3
* Value Range: bool, default=true
* Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
* instructions of a paddle graph with ParallelExecutor, otherwise with the
* CINN compiled runtime program in sequential order.
*/
PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, true,
"It controls whether to execute cinn compiled "
"program with ParallelExecutor");

/*
* CINN related FLAG
* Name: FLAGS_enable_cinn_auto_tune
* Since Version: 2.3
* Value Range: bool, default=false
* Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
* auto-tune feature enabled
*/
PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, false,
"It controls whether to use cinn with "
"its auto-tune feature enabled");

#endif

DEFINE_int32(record_pool_max_size, 2000000,
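PADDLE_DEFINE_EXPORTED_bool wraps gflags' DEFINE_bool (plus export bookkeeping), so the two new flags behave like ordinary gflags and can be set on the command line or programmatically; in Paddle they are also commonly toggled through FLAGS_-prefixed environment variables. A minimal plain-gflags sketch of defining and overriding such a flag at runtime — a hypothetical standalone program, not Paddle code:

#include <iostream>

#include "gflags/gflags.h"

// The real definition lives in paddle/fluid/platform/flags.cc through the
// PADDLE_DEFINE_EXPORTED_bool wrapper; plain gflags is used here instead.
DEFINE_bool(enable_pe_launch_cinn, true,
            "It controls whether to execute cinn compiled "
            "program with ParallelExecutor");

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  // Flags can also be flipped programmatically after parsing:
  gflags::SetCommandLineOption("enable_pe_launch_cinn", "false");
  std::cout << std::boolalpha << FLAGS_enable_pe_launch_cinn << std::endl;
  return 0;
}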