Skip to content

Commit

Permalink
add cinn_launch_op for using CINN to optimize graph (#36600)
Browse files Browse the repository at this point in the history
增加CinnLaunchOp,负责执行Cinn子图编译的结果,要点如下:
1. 在子图划分的BuildCinnPass中,每个子图在原图中会被替换为该CinnLaunchOp,由它来调用Cinn进行子图编译、执行的功能。
2. CinnLaunchOp的输入/输出即为子图的输入和输出,另外增加`compilation_key`属性,它可由该属性key从全局Cache中获取子图对象、编译结果,该属性由BuildCinnPass在创建Op时进行设置
3. CinnLaunchOp功能实现的流程为:
        - 从全局Cache中获取子图对象
        - 从全局Cache中获取子图编译结果,未命中cache时进行即时编译
        - 根据编译结果的变量信息(数据类型、shape)初始化运行时数据,分配内存/显存
        - 将运行时数据打包为参数,调用cinn的可执行对象runtime program进行计算
        - 子图运行结果通过参数指针同步到paddle侧的tensor
  • Loading branch information
CtfGo authored Nov 1, 2021
1 parent 8937205 commit 0a963ee
Show file tree
Hide file tree
Showing 10 changed files with 981 additions and 5 deletions.
9 changes: 6 additions & 3 deletions paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,15 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
<< cinn_graph->Visualize();
ApplyPass(cinn_graph.get(), "OpFusion");
auto scope = BuildScope(target, cinn_graph);
GraphCompiler graph_compiler(target, scope, cinn_graph);

auto graph_compiler =
std::make_unique<GraphCompiler>(target, scope, cinn_graph);
GraphCompiler::CompileOptions options;
options.with_instantiate_variables = false;
auto compiled_res = graph_compiler.Build(options);
auto compiled_res = graph_compiler->Build(options);
auto compiled_obj = std::make_unique<CinnCompiledObject>();
*compiled_obj = {std::move(compiled_res.runtime_program), scope,
*compiled_obj = {std::move(graph_compiler),
std::move(compiled_res.runtime_program), scope,
symbol.var_model_to_program_map()};
return compiled_obj;
}
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/framework/paddle2cinn/cinn_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ namespace framework {
namespace paddle2cinn {

struct CinnCompiledObject {
std::unique_ptr<::cinn::hlir::framework::GraphCompiler> compiler;
std::unique_ptr<::cinn::hlir::framework::Program> runtime_program;
std::shared_ptr<::cinn::hlir::framework::Scope> scope;
std::unordered_map<std::string, std::string> paddle2cinn_varmap;
Expand Down
13 changes: 11 additions & 2 deletions paddle/fluid/operators/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ if(WITH_UNITY_BUILD)
include(unity_build_rule.cmake)
endif()

register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op
recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op cinn_launch_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})

op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
op_library(save_combine_op DEPS string_array)
Expand Down Expand Up @@ -166,6 +166,15 @@ if (WITH_ASCEND_CL)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner)
endif()

# Targets for the cinn_launch operator; only defined when the build is
# configured with WITH_CINN. (cinn_launch_op is listed in the EXCLUDES of
# register_operators above, so it is declared explicitly here instead.)
if (WITH_CINN)
# Helper library shared by the operator and its tests.
cc_library(cinn_launch_op_helper SRCS cinn_launch_op_helper.cc DEPS operator cinn)
cc_test(cinn_launch_op_helper_test SRCS cinn_launch_op_helper_test.cc DEPS cinn_launch_op_helper)
# The operator itself: the .cc file registers the op and the CPU kernel, the
# .cu.cc file registers the CUDA kernel.
op_library(cinn_launch_op SRCS cinn_launch_op.cc cinn_launch_op.cu.cc DEPS cinn_compiler cinn_launch_op_helper cinn ${OP_HEADER_DEPS})
# The operator-level test is only built for GPU builds.
if (WITH_GPU)
nv_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op)
endif()
endif()

# FIXME(typhoonzero): operator deps may not needed.
# op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
# op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
Expand Down
105 changes: 105 additions & 0 deletions paddle/fluid/operators/cinn_launch_op.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/cinn_launch_op.h"

namespace paddle {
namespace operators {

/**
 * Operator definition of `cinn_launch`.
 *
 * It only validates that the input slot `kX` and the output slot `kOutputs`
 * are present and selects the kernel to run; the actual work is done by
 * CinnLaunchOpKernel.
 */
class CinnLaunchOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  /**
   * Checks that the operator has its inputs and outputs bound.
   *
   * Both slots are declared duplicable by CinnLaunchOpMaker, so the plural
   * predicates HasInputs()/HasOutputs() are used for both of them. The
   * singular HasOutput() is meant for a slot holding a single variable and
   * is not appropriate for a slot that may hold several.
   *
   * No output shape is inferred here.
   */
  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnLaunchOp");
    OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs,
                   "CinnLaunchOp");
  }

 protected:
  /* [Why use single type kernel]:
   *
   * This op is similar to a control flow op: it does not need
   * an op kernel, but in order to make it execute under dynamic
   * graph mode, it is implemented with an op kernel.
   *
   * Whether the kernel data type is int, float or any other type
   * has no effect on its execution logic, so a data type is
   * specified directly here.
   *
   * Of course, the data type chosen here is also not important.
   */

  /**
   * Always selects the FP32 kernel on the place of the execution context,
   * see [Why use single type kernel].
   */
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(framework::proto::VarType::FP32,
                                   ctx.GetPlace());
  }
};

// Declares the proto of the `cinn_launch` operator: one duplicable input
// slot (kX), one duplicable output slot (kOutputs), the string attribute
// kCompilationKey, and the operator documentation.
class CinnLaunchOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
// Input slot: the input variables of the graph wrapped by this operator.
AddInput(kX,
"(vector<LoDTensor>)"
"which are the input of graph inside the CinnLaunchOp.")
.AsDuplicable();
// Output slot: the output variables of the graph wrapped by this operator.
AddOutput(kOutputs,
"(vector<LoDTensor>)"
"which are the output of graph inside the CinnLaunchOp.")
.AsDuplicable();
// Key used by the kernel to look up the graph object in CinnCompiler.
AddAttr<std::string>(
kCompilationKey,
"(string)"
"a hash key used to get the graph object or its computation result.");
// The text below is the operator documentation and is part of the op proto;
// it is a raw string literal, so its content must not be re-indented.
AddComment(R"DOC(
CinnLaunch Operator.
This operator is used to launch CINN(https://github.com/PaddlePaddle/CINN/blob/develop/README.md)
to compile a graph and execute the compiled object.
Both input and output of this operator are a set of variables
which are input and output of the graph respectively that will be
compiled and executed in this operator.
In addition, there is an attribute named 'compilation_key' should be
set necessarily to get corresponding ir::Graph object of the graph
or its computation result.
It accomplishs the computation of graph following several steps:
1. Fetch ir::Graph object from CinnCompiler using kCompilationKey
2. Compile the graph to a compiled object, and insert it to the
global cache so that we can directly query it from this cache next time
when shape of input variables are not changed at all.
3. Create and instantiate all variables used to execute compiled runtime program
if necessary according to the info(type,shape) included in the return scope.
4. Pack each tensor buffer of all above variables as execution arguments.
5. Launch execution of the runtime program with above arguments, then
the result would be output by writing value on underlying buffer address.
)DOC");
}
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
// Register the `cinn_launch` operator with its definition and proto maker.
// Both grad-op makers (static graph OpDesc and imperative OpBase) are the
// empty ones, so presumably no gradient operator is generated for it.
REGISTER_OPERATOR(
cinn_launch, ops::CinnLaunchOp, ops::CinnLaunchOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
/* see [Why use single type kernel] */
// Only a single (float) CPU kernel is registered; this matches the FP32
// kernel type returned by CinnLaunchOp::GetExpectedKernelType.
REGISTER_OP_CPU_KERNEL(
cinn_launch,
ops::CinnLaunchOpKernel<paddle::platform::CPUDeviceContext, float>);
20 changes: 20 additions & 0 deletions paddle/fluid/operators/cinn_launch_op.cu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/cinn_launch_op.h"

/* see [Why use single type kernel] */
// CUDA counterpart of the CPU registration: a single float kernel of
// CinnLaunchOpKernel is registered for the `cinn_launch` operator.
REGISTER_OP_CUDA_KERNEL(cinn_launch,
paddle::operators::CinnLaunchOpKernel<
paddle::platform::CUDADeviceContext, float>);
114 changes: 114 additions & 0 deletions paddle/fluid/operators/cinn_launch_op.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>
#include <string>
#include <unordered_map>
#include "cinn/hlir/framework/graph_compiler.h"
#include "cinn/hlir/framework/scope.h"
#include "cinn/runtime/cinn_runtime.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
#include "paddle/fluid/operators/cinn_launch_op_helper.h"
#include "paddle/fluid/string/string_helper.h"

namespace paddle {
namespace operators {

// Name of the operator's input slot.
static constexpr char kX[] = "X";
// Name of the operator's output slot.
static constexpr char kOutputs[] = "Out";
// Name of the string attribute used to look up the graph in CinnCompiler.
static constexpr char kCompilationKey[] = "compilation_key";

using LoDTensor = framework::LoDTensor;
// Ordered map from a variable name to a read-only paddle tensor.
using Name2ConstTensor = std::map<std::string, const LoDTensor*>;
using CinnTensor = cinn::hlir::framework::Tensor;
// Map from a variable name to the tensor found in the CINN compiled scope.
using Name2CinnTensor = std::unordered_map<std::string, CinnTensor>;
using framework::paddle2cinn::CinnCompiler;

/**
 * Kernel of the `cinn_launch` operator.
 *
 * Compute() looks up the graph identified by the `compilation_key`
 * attribute, asks CinnCompiler for its compilation result on the current
 * place, packs the input, output and temporary variables into execution
 * arguments and finally executes the compiled runtime program.
 */
template <typename DeviceContext, typename T>
class CinnLaunchOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // ---- Stage 1: locate the graph and gather the input tensors ----
    PADDLE_ENFORCE_EQ(ctx.HasAttr(kCompilationKey), true,
                      platform::errors::NotFound(
                          "No Attribute(%s) found for CinnLaunchOp operator.",
                          kCompilationKey));
    const auto& key = ctx.template Attr<std::string>(kCompilationKey);
    VLOG(2) << "CinnLaunchOp compilation_key:" << key;

    const auto& subgraph = CinnCompiler::GetInstance()->FindGraph(key);
    auto in_names = ctx.InputNames(kX);
    Name2ConstTensor in_tensors =
        details::GetConstTensors(ctx.scope(), in_names);

    // ---- Stage 2: obtain the compilation result for this place ----
    auto cinn_target = details::PlaceToCinnTarget(ctx.GetPlace());
    const auto& compiled =
        CinnCompiler::GetInstance()->Compile(subgraph, in_tensors, cinn_target);
    VLOG(2) << "CinnLaunchOp compile graph done on " << ctx.GetPlace();

    const auto& program = compiled.runtime_program;
    const auto& cinn_scope = *compiled.scope;
    const auto& var_map = compiled.paddle2cinn_varmap;

    // ---- Stage 3: build the execution arguments ----
    // Every variable of the runtime program is made available on the paddle
    // side and appended to `args`; `buffers` owns the cinn_buffer_t objects
    // created while packing.
    VLOG(2) << "CinnLaunchOp prepare execution arguments";
    std::map<std::string, cinn_pod_value_t> args;
    std::vector<std::unique_ptr<cinn_buffer_t>> buffers;

    // 3.a inputs: verify them against the compiled tensors, then pack.
    Name2CinnTensor in_compiled =
        details::GetCompiledTensors(in_names, cinn_scope, var_map);
    details::CheckTensorEquivalent(in_tensors, in_compiled);
    details::AppendExecutionArguments(ctx.scope(), in_names, var_map, &args,
                                      &buffers);

    // 3.b outputs: initialize from the compiled tensors, verify, then pack.
    auto out_names = ctx.OutputNames(kOutputs);
    Name2CinnTensor out_compiled =
        details::GetCompiledTensors(out_names, cinn_scope, var_map);
    details::InitializeOutputVar(ctx.scope(), ctx.GetPlace(), out_compiled);
    Name2ConstTensor out_tensors =
        details::GetConstTensors(ctx.scope(), out_names);
    details::CheckTensorEquivalent(out_tensors, out_compiled);
    details::AppendExecutionArguments(ctx.scope(), out_names, var_map, &args,
                                      &buffers);

    // 3.c temporaries: variables of the compiled scope that are neither
    // inputs nor outputs live in a temporary scope. The scope is created
    // outside the branch so that it is still alive while the program runs.
    auto tmp_names =
        details::SeperateTempVar(cinn_scope, var_map, in_names, out_names);
    auto tmp_scope = ctx.scope().NewTmpScope();
    if (!tmp_names.empty()) {
      details::InitializeTempVar(tmp_names, cinn_scope, ctx.GetPlace(),
                                 tmp_scope.get());
      details::AppendExecutionArguments(*tmp_scope, tmp_names, var_map, &args,
                                        &buffers);
    }

    // ---- Stage 4: run the compiled runtime program ----
    program->Execute(&args);
    VLOG(2) << "CinnLaunchOp launch runtime_program execution done.";
  }
};

} // namespace operators
} // namespace paddle
Loading

0 comments on commit 0a963ee

Please sign in to comment.