add cinn_launch_op for using CINN to optimize graph (#36600)

增加CinnLaunchOp，负责执行Cinn子图编译的结果，要点如下： 1. 在子图划分的BuildCinnPass中，每个子图在原图中会被替换为该CinnLaunchOp，由它来调用Cinn进行子图编译、执行的功能。 2. CinnLaunchOp的输入/输出即为子图的输入和输出，另外增加`compilation_key`属性，它可由该属性key从全局Cache中获取子图对象、编译结果，该属性由BuildCinnPass在创建Op时进行设置。 3. CinnLaunchOp功能实现的流程为： - 从全局Cache中获取子图对象 - 从全局Cache中获取子图编译结果，未命中cache时进行即时编译 - 根据编译结果的变量信息（数据类型、shape）初始化运行时数据，分配内存/显存 - 将运行时数据打包为参数，调用cinn的可执行对象runtime program进行计算 - 子图运行结果通过参数指针同步到paddle侧的tensor
PaddlePaddle · Nov 1, 2021 · 0a963ee · 0a963ee
1 parent 8937205
commit 0a963ee
Show file tree

Hide file tree

Showing 10 changed files with 981 additions and 5 deletions.
diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
@@ -112,12 +112,15 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
           << cinn_graph->Visualize();
   ApplyPass(cinn_graph.get(), "OpFusion");
   auto scope = BuildScope(target, cinn_graph);
-  GraphCompiler graph_compiler(target, scope, cinn_graph);
+
+  auto graph_compiler =
+      std::make_unique<GraphCompiler>(target, scope, cinn_graph);
   GraphCompiler::CompileOptions options;
   options.with_instantiate_variables = false;
-  auto compiled_res = graph_compiler.Build(options);
+  auto compiled_res = graph_compiler->Build(options);
   auto compiled_obj = std::make_unique<CinnCompiledObject>();
-  *compiled_obj = {std::move(compiled_res.runtime_program), scope,
+  *compiled_obj = {std::move(graph_compiler),
+                   std::move(compiled_res.runtime_program), scope,
                    symbol.var_model_to_program_map()};
   return compiled_obj;
 }

diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h
@@ -33,6 +33,7 @@ namespace framework {
 namespace paddle2cinn {
 
 struct CinnCompiledObject {
+  std::unique_ptr<::cinn::hlir::framework::GraphCompiler> compiler;
   std::unique_ptr<::cinn::hlir::framework::Program> runtime_program;
   std::shared_ptr<::cinn::hlir::framework::Scope> scope;
   std::unordered_map<std::string, std::string> paddle2cinn_varmap;

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
@@ -79,8 +79,8 @@ if(WITH_UNITY_BUILD)
     include(unity_build_rule.cmake)
 endif()
 
-register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op 
-        recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
+register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op
+        recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op cinn_launch_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
 op_library(save_combine_op DEPS string_array)
@@ -166,6 +166,15 @@ if (WITH_ASCEND_CL)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner)
 endif()
 
+if (WITH_CINN)
+  # Helper library used by the CINN launch operator, together with its test.
+  cc_library(cinn_launch_op_helper SRCS cinn_launch_op_helper.cc DEPS operator cinn)
+  cc_test(cinn_launch_op_helper_test SRCS cinn_launch_op_helper_test.cc DEPS cinn_launch_op_helper)
+
+  # cinn_launch_op is listed in the EXCLUDES of register_operators, so it is
+  # declared explicitly here with its CINN dependencies.
+  op_library(cinn_launch_op SRCS cinn_launch_op.cc cinn_launch_op.cu.cc DEPS cinn_compiler cinn_launch_op_helper cinn ${OP_HEADER_DEPS})
+
+  # The operator test is only declared for GPU builds.
+  if (WITH_GPU)
+    nv_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op)
+  endif()
+endif()
+
 # FIXME(typhoonzero): operator deps may not needed.
 # op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
 # op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)

diff --git a/paddle/fluid/operators/cinn_launch_op.cc b/paddle/fluid/operators/cinn_launch_op.cc
@@ -0,0 +1,105 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/cinn_launch_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition of `cinn_launch`: it verifies that the input slot kX
+// and the output slot kOutputs are present and selects the kernel type.
+class CinnLaunchOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Checks the presence of the input slot kX and the output slot kOutputs.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnLaunchOp");
+    // kOutputs is declared AsDuplicable() by CinnLaunchOpMaker, so it is
+    // checked with the plural HasOutputs(), mirroring HasInputs() above.
+    // The singular HasOutput() is intended for single-variable slots.
+    OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs,
+                   "CinnLaunchOp");
+  }
+
+ protected:
+  /* [Why use single type kernel]:
+   *
+   * This op is similar to a control flow op: it does not need
+   * an op kernel, but in order to make it executable under dynamic
+   * graph mode, it is implemented with an op kernel.
+   *
+   * The execution logic does not depend on whether the kernel data
+   * type is int, float or any other type, so a data type is simply
+   * specified directly here.
+   *
+   * Of course, the data type chosen here is also not important.
+   */
+
+  // Always reports an FP32 kernel on the place of the execution context;
+  // see [Why use single type kernel].
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(framework::proto::VarType::FP32,
+                                   ctx.GetPlace());
+  }
+};
+
+// Declares the proto of `cinn_launch`: one duplicable input slot (kX), one
+// duplicable output slot (kOutputs), the string attribute kCompilationKey
+// and the operator documentation.
+class CinnLaunchOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    // Inputs of the graph wrapped by this operator.
+    AddInput(kX,
+             "(vector<LoDTensor>)which are the input of graph "
+             "inside the CinnLaunchOp.")
+        .AsDuplicable();
+
+    // Outputs of the graph wrapped by this operator.
+    AddOutput(kOutputs,
+              "(vector<LoDTensor>)which are the output of graph "
+              "inside the CinnLaunchOp.")
+        .AsDuplicable();
+
+    // Key used to look up the graph object or its compilation result.
+    AddAttr<std::string>(kCompilationKey,
+                         "(string)a hash key used to get the graph object "
+                         "or its computation result.");
+
+    AddComment(R"DOC(
+CinnLaunch Operator.
+
+This operator is used to launch CINN(https://github.com/PaddlePaddle/CINN/blob/develop/README.md)
+to compile a graph and execute the compiled object.
+
+Both input and output of this operator are a set of variables
+which are input and output of the graph respectively that will be
+compiled and executed in this operator.
+In addition, there is an attribute named 'compilation_key' should be
+set necessarily to get corresponding ir::Graph object of the graph
+or its computation result.
+
+It accomplishs the computation of graph following several steps:
+  1. Fetch ir::Graph object from CinnCompiler using kCompilationKey
+  2. Compile the graph to a compiled object, and insert it to the
+     global cache so that we can directly query it from this cache next time
+     when shape of input variables are not changed at all.
+  3. Create and instantiate all variables used to execute compiled runtime program
+     if necessary according to the info(type,shape) included in the return scope.
+  4. Pack each tensor buffer of all above variables as execution arguments.
+  5. Launch execution of the runtime program with above arguments, then
+     the result would be output by writing value on underlying buffer address.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+// Registers the operator and its proto maker. EmptyGradOpMaker is supplied
+// for both the OpDesc and the imperative OpBase variants.
+REGISTER_OPERATOR(
+    cinn_launch,
+    ops::CinnLaunchOp,
+    ops::CinnLaunchOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+// Only the float kernel is registered for CPU,
+// see [Why use single type kernel].
+REGISTER_OP_CPU_KERNEL(
+    cinn_launch,
+    ops::CinnLaunchOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn_launch_op.cu.cc
@@ -0,0 +1,20 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/cinn_launch_op.h"
+
+namespace ops = paddle::operators;
+
+/* Only the float kernel is registered for CUDA,
+ * see [Why use single type kernel] */
+REGISTER_OP_CUDA_KERNEL(
+    cinn_launch,
+    ops::CinnLaunchOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/cinn_launch_op.h b/paddle/fluid/operators/cinn_launch_op.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "cinn/hlir/framework/graph_compiler.h"
+#include "cinn/hlir/framework/scope.h"
+#include "cinn/runtime/cinn_runtime.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h"
+#include "paddle/fluid/operators/cinn_launch_op_helper.h"
+#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace operators {
+
+// Names of the input slot, the output slot and the attribute of cinn_launch.
+static constexpr char kX[] = "X";
+static constexpr char kOutputs[] = "Out";
+static constexpr char kCompilationKey[] = "compilation_key";
+
+// Short names for the paddle-side and CINN-side types used by the kernel.
+using framework::paddle2cinn::CinnCompiler;
+using LoDTensor = framework::LoDTensor;
+using CinnTensor = cinn::hlir::framework::Tensor;
+
+// Lookup tables keyed by variable name.
+using Name2ConstTensor = std::map<std::string, const LoDTensor*>;
+using Name2CinnTensor = std::unordered_map<std::string, CinnTensor>;
+
+// Kernel of `cinn_launch`. Compute() looks up the graph registered under the
+// kCompilationKey attribute, obtains its compiled object from CinnCompiler,
+// packs the input, output and temporary variables as execution arguments,
+// and runs the compiled runtime program with them.
+template <typename DeviceContext, typename T>
+class CinnLaunchOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Step 1. Find graph object and prepare input
+    PADDLE_ENFORCE_EQ(ctx.HasAttr(kCompilationKey), true,
+                      platform::errors::NotFound(
+                          "No Attribute(%s) found for CinnLaunchOp operator.",
+                          kCompilationKey));
+    const auto& key = ctx.template Attr<std::string>(kCompilationKey);
+    VLOG(2) << "CinnLaunchOp compilation_key:" << key;
+
+    const auto& graph = CinnCompiler::GetInstance()->FindGraph(key);
+    auto input_names = ctx.InputNames(kX);
+    Name2ConstTensor paddle_inputs =
+        details::GetConstTensors(ctx.scope(), input_names);
+
+    // Step 2. Get compilation result of the graph
+    auto target = details::PlaceToCinnTarget(ctx.GetPlace());
+    const auto& compiled_object =
+        CinnCompiler::GetInstance()->Compile(graph, paddle_inputs, target);
+    VLOG(2) << "CinnLaunchOp compile graph done on " << ctx.GetPlace();
+
+    const auto& runtime_program = compiled_object.runtime_program;
+    const auto& compiled_scope = *compiled_object.scope;
+    const auto& paddle2cinn_varmap = compiled_object.paddle2cinn_varmap;
+
+    // Step 3. Initialize all variables of the compilation runtime program
+    //         in paddle, and pack them into execution arguments
+    VLOG(2) << "CinnLaunchOp prepare execution arguments";
+    std::map<std::string, cinn_pod_value_t> name2argument;
+    std::vector<std::unique_ptr<cinn_buffer_t>> hold_buffers;
+    // Appends the variables `names` found in `scope` to the argument map;
+    // shared by the input, output and temporary variable groups below.
+    auto pack_arguments = [&](auto&& scope, auto&& names) {
+      details::AppendExecutionArguments(scope, names, paddle2cinn_varmap,
+                                        &name2argument, &hold_buffers);
+    };
+
+    // 3.1 input variables: check them against the compiled tensors first
+    Name2CinnTensor cinn_inputs = details::GetCompiledTensors(
+        input_names, compiled_scope, paddle2cinn_varmap);
+    details::CheckTensorEquivalent(paddle_inputs, cinn_inputs);
+    pack_arguments(ctx.scope(), input_names);
+
+    // 3.2 output variables: they are initialized from the compiled tensors
+    //     before being fetched from the scope and checked
+    auto output_names = ctx.OutputNames(kOutputs);
+    Name2CinnTensor cinn_outputs = details::GetCompiledTensors(
+        output_names, compiled_scope, paddle2cinn_varmap);
+    details::InitializeOutputVar(ctx.scope(), ctx.GetPlace(), cinn_outputs);
+    Name2ConstTensor paddle_outputs =
+        details::GetConstTensors(ctx.scope(), output_names);
+    details::CheckTensorEquivalent(paddle_outputs, cinn_outputs);
+    pack_arguments(ctx.scope(), output_names);
+
+    // 3.3 temporary variables: created in a temporary scope of this kernel
+    auto temp_names = details::SeperateTempVar(
+        compiled_scope, paddle2cinn_varmap, input_names, output_names);
+    auto temp_scope = ctx.scope().NewTmpScope();
+    if (!temp_names.empty()) {
+      details::InitializeTempVar(temp_names, compiled_scope, ctx.GetPlace(),
+                                 temp_scope.get());
+      pack_arguments(*temp_scope, temp_names);
+    }
+
+    // Step 4. Launch CINN to execute the compilation runtime program
+    runtime_program->Execute(&name2argument);
+    VLOG(2) << "CinnLaunchOp launch runtime_program execution done.";
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle