Skip to content

Commit

Permalink
cinn_launch_op: optimize the overhead of preparing variables before executing cinn compiled program (#41777)
Browse files Browse the repository at this point in the history

* optimize preparation overhead before executing cinn compiled program

* update code notes

* fix flag annotation

* add a flag of auto-tune feature beforehand
  • Loading branch information
CtfGo authored Apr 18, 2022
1 parent 8f469dd commit 2d4fe16
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 60 deletions.
34 changes: 34 additions & 0 deletions paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,13 @@
#include "cinn/hlir/framework/graph_compiler.h"
#include "cinn/hlir/framework/pass.h"
#include "cinn/hlir/pass/use_pass.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h"
#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor.h"
Expand All @@ -45,6 +47,7 @@
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/core/utils/rw_lock.h"

DECLARE_bool(enable_pe_launch_cinn);
namespace paddle {
namespace framework {
namespace paddle2cinn {
Expand Down Expand Up @@ -217,6 +220,33 @@ void CinnCompiler::Clear() {
real_compiled_num_.store(0);
}

// Check whether a compiled result is consistent with the original subgraph:
// every output variable must be produced by the compiled program, and every
// input variable actually used by CINN must keep an equivalent tensor layout.
void CinnCompiler::CheckCompiledValid(
    const ir::Graph& graph,
    const std::map<std::string, const LoDTensor*>& input_tensors,
    const CinnCompiledObject& compiled_obj) const {
  const auto& input_var_names = graph.Get<std::vector<std::string>>(kInputVars);
  const auto& output_var_names =
      graph.Get<std::vector<std::string>>(kOutputVars);
  auto* launch_context = compiled_obj.launch_context.get();
  // 1. check all of the output variables will be assigned by compiled program
  for (auto&& var_name : output_var_names) {
    PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true,
                      platform::errors::PreconditionNotMet(
                          "Variable(%s) not applied in CINN", var_name));
  }
  // 2. check all of the used input variables were correctly deduced by CINN.
  for (const auto& var_name : input_var_names) {
    // some input variables were not used by CINN because they were eliminated
    // by its optimized passes or some operators of it need less inputs
    if (!launch_context->IsVariableUsed(var_name)) {
      // note the trailing space so the variable name is not fused
      // with the preceding words in the log line
      VLOG(4) << "Input variable " << var_name << " not used by cinn";
      continue;
    }
    launch_context->CheckTensorEquivalent(var_name,
                                          *input_tensors.at(var_name));
  }
}

std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
Expand Down Expand Up @@ -244,6 +274,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
std::make_unique<GraphCompiler>(target, scope, cinn_graph);
GraphCompiler::CompileOptions options;
options.with_instantiate_variables = false;
if (!FLAGS_enable_pe_launch_cinn) {
options.with_buffer_handle_instruction_inserted = true;
}
auto compiled_res =
graph_compiler->Build(options, std::move(fetch_ids), stream);
auto compiled_obj = std::make_unique<CinnCompiledObject>();
Expand All @@ -254,6 +287,7 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
compiled_obj->launch_context =
std::make_unique<operators::details::CinnLaunchContext>(graph,
*compiled_obj);
CheckCompiledValid(graph, input_tensors, *compiled_obj);
return compiled_obj;
}

Expand Down
7 changes: 7 additions & 0 deletions paddle/fluid/framework/paddle2cinn/cinn_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,13 @@ class CinnCompiler {
const ::cinn::common::Target& target, std::int64_t compiled_num,
void* stream = nullptr) const;

// check whether a compiled result is valid by comparing
// the consistency of external variables of the subgraph
void CheckCompiledValid(
const ir::Graph& graph,
const std::map<std::string, const LoDTensor*>& input_tensors,
const CinnCompiledObject& compiled_obj) const;

std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
std::unordered_map<CinnCacheKeyByAddress, std::int64_t, CinnCacheKey::Hash>
cache_by_address_;
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/cinn/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ include(operators)
cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context)
cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn)

SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type)
register_operators(DEPS ${CINN_OP_DEPS})

if (WITH_TESTING)
Expand Down
25 changes: 16 additions & 9 deletions paddle/fluid/operators/cinn/cinn_launch_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "paddle/fluid/framework/paddle2cinn/transform_type.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/cinn/cinn_op_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
Expand Down Expand Up @@ -69,13 +70,6 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph,
graph.Get<std::vector<std::string>>(framework::paddle2cinn::kOutputVars);
internal_var_names_ =
ExtractInternalVarNames(input_var_names, output_var_names);
// check completeness of output variables in compiled result
for (auto&& var_name : output_var_names) {
PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true,
platform::errors::PreconditionNotMet(
"Variable(%s) not applied in CINN", var_name));
}

// initialize all execution arguments
InitializeArguments();
// DEPRECATED(CtfGo): following callback assignment will be deprecated soon
Expand Down Expand Up @@ -235,7 +229,7 @@ void CinnLaunchContext::InitializeArguments() {
cinn_tensor->shape().data().size());
cinn_buffer->type = cinn::runtime::ToRuntimeType(cinn_tensor->type());
VLOG(4) << string::Sprintf(
"Append an argument:name(%s),dims(%s),type(%s)",
"Append an argument:name(%s),dims(%s),type(%s)", arg,
framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(),
cinn_tensor->type());
name2argument_.emplace(arg, cinn_buffer.get());
Expand Down Expand Up @@ -400,7 +394,20 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place,
std::unordered_map<Scope*, Scope*> scope_map = {
{parallel_executor_->GetLocalScopes().front(), scope}};
parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map);
parallel_executor_->PrepareVariables(scope);
// instead of using the PrepareVariables function of ParallelExecutor to
// initialize all variables, here we only initialize internal variables
// because external variables are already included in parent scope.
for (auto&& var_name : internal_var_names_) {
auto* var = scope->FindVar(var_name);
if (var != nullptr) {
VLOG(5) << "internal variable:" << var_name
<< " has been initialized beforehand in global scope, skipped.";
continue;
}
framework::InitializeVariable(scope->Var(var_name),
framework::proto::VarType::LOD_TENSOR);
}

for (auto&& var_name : initialized_beforehand_vars_) {
auto* var = scope->GetVar(var_name);
auto* buffer = GetCinnBufferOfVar(var_name);
Expand Down
40 changes: 16 additions & 24 deletions paddle/fluid/operators/cinn/cinn_launch_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@
#include <string>
#include <unordered_map>
#include <unordered_set>

#include "cinn/common/target.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
#include "paddle/fluid/operators/cinn/cinn_launch_context.h"
#include "paddle/fluid/operators/cinn/cinn_op_helper.h"

DECLARE_bool(enable_pe_launch_cinn);
namespace paddle {
namespace operators {

Expand Down Expand Up @@ -101,34 +104,23 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile(
compilation_key, inputs_name2tensor, target, stream);
details::DebugCinnCompiledResult(cinn_compiled_object);

auto* launch_context = cinn_compiled_object.launch_context.get();
// Step 3. check the computational consistency of the subgraph
// before and after the compilation
// 3.1 Input variables: tensors of input variables have
// been initialized before graph compiled, just check the
// equiality between tensors of paddle and cinn.
for (const auto& var_name : input_x_variable_names) {
// some input variables don't need for cinn because they are
// eliminated by optimized passes or some cinn operators use
// less variables
if (!launch_context->IsVariableUsed(var_name)) {
VLOG(4) << "Input variable" << var_name << " not used by cinn";
continue;
}
launch_context->CheckTensorEquivalent(var_name,
*inputs_name2tensor.at(var_name));
}

// Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
// Step 3. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
details::SetCinnRuntimeFlags();

// Step 5. use PE to execute the compiled CINN instructions
// in nodes of the runtime graph
VLOG(4) << "Execute the runtime graph by PE";
framework::Scope& exec_scope = scope.NewScope();
auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
// Step 4. Execute the compiled CINN instructions by a PE or
// by the CINN compiled program in sequential order
if (FLAGS_enable_pe_launch_cinn) {
VLOG(4) << "Execute the runtime graph by PE";
framework::Scope& exec_scope = scope.NewScope();
auto* pe = launch_context->InitializePE(place, &exec_scope);
pe->RunWithoutFetch(launch_context->GetSkipEagerVars());
} else {
VLOG(4) << "Execute the compiled executable program";
launch_context->UpdateCapturedEnv(scope, place);
LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
}
VLOG(4) << "CinnLaunchOp launch execution done.";
}
};
Expand Down
77 changes: 51 additions & 26 deletions paddle/fluid/operators/cinn/cinn_launch_op_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ USE_OP(cinn_launch);
USE_OP(cinn_instruction_run);
USE_OP_ITSELF(elementwise_add);
DECLARE_double(eager_delete_tensor_gb);
DECLARE_bool(enable_pe_launch_cinn);

PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
#ifdef PADDLE_WITH_CUDA
Expand All @@ -42,43 +43,67 @@ namespace paddle::operators {

using framework::paddle2cinn::CinnCompiler;

TEST(CinnLaunchOpTest, TestWithElementwiseAdd) {
paddle::framework::InitDevices();
platform::SetNumThreads(1);
// cache test graph into CinnCompiler
const std::string& test_op_out_name = "cinn_launch_op_out";
const std::string& add_op_out_name = "add_op_out";
auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));

// create cinn_launch_op and elementwise_add op
auto cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
"cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
{{"compilation_key", compilation_key}});
auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
"elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
{{"Out", {add_op_out_name}}}, {{}});

// Run ops and check the computation results
auto run_and_check_fn = [&](const platform::Place& place) {
// Test fixture that builds one elementwise_add subgraph, compiles it through
// CinnCompiler, and compares the cinn_launch result against the native op.
class TestCinnLaunchOp : public ::testing::Test {
 public:
  // Outputs of the cinn_launch op and of the reference elementwise_add op.
  // They must be DISTINCT variables, otherwise CompareOpResult would
  // trivially compare a tensor with itself and the test could never fail.
  const char* test_op_out_name = "test_op_out";
  const char* add_op_out_name = "add_op_out";
  std::unique_ptr<framework::OperatorBase> cinn_launch_op;
  std::unique_ptr<framework::OperatorBase> elementwise_add_op;

  void SetUp() override {
    paddle::framework::InitDevices();
    platform::SetNumThreads(1);
    // cache test graph into CinnCompiler
    auto compilation_key = CinnCompiler::GetInstance()->AddGraph(
        CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name));

    // create cinn_launch_op and elementwise_add op
    cinn_launch_op = paddle::framework::OpRegistry::CreateOp(
        "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}},
        {{"compilation_key", compilation_key}});
    elementwise_add_op = paddle::framework::OpRegistry::CreateOp(
        "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}},
        {{"Out", {add_op_out_name}}}, {{}});
  }

  void RunAndCheck(const platform::Place& place) {
    // Run both ops on the same random inputs and check the computation
    // results are equal.
    // A negative value disables eager tensor deletion, keeping intermediate
    // tensors alive for the comparison below.
    FLAGS_eager_delete_tensor_gb = -1;
    framework::Scope scope;
    InitVariablesWithRandomValue<float>({"x", "y"}, {10, 20}, place, &scope);
    scope.Var(test_op_out_name)->GetMutable<LoDTensor>();
    scope.Var(add_op_out_name)->GetMutable<LoDTensor>();
    elementwise_add_op->Run(scope, place);
    cinn_launch_op->Run(scope, place);
    CompareOpResult<float>(scope.GetVar(test_op_out_name),
                           scope.GetVar(add_op_out_name));
  }

  // Drop all cached graphs/compiled objects so tests stay independent.
  void TearDown() override { CinnCompiler::GetInstance()->Clear(); }
};

TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) {
  // Execute twice on each place: the second run on the same place
  // exercises the compilation-cache logic.
  auto run_twice = [this](const platform::Place& place) {
    RunAndCheck(place);
    RunAndCheck(place);
  };
  // CPU
  run_twice(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
  // GPU
  run_twice(platform::CUDAPlace());
#endif
}

TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) {
  // set FLAGS_enable_pe_launch_cinn=false to switch to use
  // default scheduler of CINN to execute the compiled program
  const bool saved_flag = FLAGS_enable_pe_launch_cinn;
  FLAGS_enable_pe_launch_cinn = false;

  RunAndCheck(platform::CPUPlace());
  // the second run on the same place is to check the cache logic
  RunAndCheck(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
  // GPU
  RunAndCheck(platform::CUDAPlace());
  RunAndCheck(platform::CUDAPlace());
#endif
  // restore the global flag so subsequently-run tests are not
  // silently switched away from the ParallelExecutor path
  FLAGS_enable_pe_launch_cinn = saved_flag;
}

Expand Down
26 changes: 26 additions & 0 deletions paddle/fluid/platform/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,32 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "",
*/
PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "",
"It controls the cinn op subset to be not used.");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_pe_launch_cinn
 * Since Version: 2.3
 * Value Range: bool, default=true
 * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled
 * instructions of a paddle graph with ParallelExecutor; otherwise they are
 * executed through the CINN compiled runtime program in sequential order.
 */
PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, true,
                            "It controls whether to execute cinn compiled "
                            "program with ParallelExecutor");

/*
 * CINN related FLAG
 * Name: FLAGS_enable_cinn_auto_tune
 * Since Version: 2.3
 * Value Range: bool, default=false
 * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its
 * auto-tune feature enabled, searching for better kernel schedules.
 */
PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, false,
                            "It controls whether to use cinn with "
                            "its auto-tune feature enabled");

#endif

DEFINE_int32(record_pool_max_size, 2000000,
Expand Down

0 comments on commit 2d4fe16

Please sign in to comment.