Support trt cuda graph. #53406

Merged · 10 commits · May 9, 2023
1 change: 1 addition & 0 deletions paddle/fluid/inference/analysis/argument.h
@@ -231,6 +231,7 @@ struct Argument {
TensorRtUseStaticEngine,
bool);
DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_cuda_graph, TensorRtUseCudaGraph, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_varseqlen, TensorRtUseOSS, bool);
DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool);
DECL_ARGUMENT_FIELD(tensorrt_transformer_posid,
2 changes: 2 additions & 0 deletions paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -165,6 +165,8 @@ void IRPassManager::CreatePasses(Argument *argument,
new AnalysisConfig::Precision(precision_mode));
pass->Set("context_memory_sharing",
new bool(argument->trt_engine_memory_sharing()));
pass->Set("use_cuda_graph",
new bool(argument->tensorrt_use_cuda_graph()));
bool use_static_engine = argument->tensorrt_use_static_engine();
bool model_from_memory = argument->model_from_memory();
std::string optim_cache_dir = argument->optim_cache_dir();
75 changes: 65 additions & 10 deletions paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -101,6 +101,22 @@ void OutputProcess(framework::ir::Graph *graph,
}
}

// Determine whether the whole graph is offloaded to TensorRT. If so, we can
// try to enable optimizations such as cudaGraph.
bool AllNodesLowerToTrtPostProcess(framework::ir::Graph *graph) {
std::unordered_set<std::string> trt_nodes_set{
"feed", "fetch", "tensorrt_engine"};
bool all_nodes_offload_to_trt = true;
for (auto *node : graph->Nodes()) {
if (node->IsOp()) {
if (!trt_nodes_set.count(node->Op()->Type())) {
all_nodes_offload_to_trt = false;
break;
}
}
}
return all_nodes_offload_to_trt;
}
} // namespace

using framework::ir::Node;
@@ -124,6 +140,7 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(

auto enable_int8 = Get<bool>("enable_int8");
auto use_calib_mode = Get<bool>("use_calib_mode");
bool use_cuda_graph = Get<bool>("use_cuda_graph");
bool no_calib_int8 = enable_int8 && !(use_calib_mode);
auto trt_disabled_ops = Get<std::vector<std::string>>("trt_disabled_ops");
auto with_dynamic_shape = Get<bool>("with_dynamic_shape");
@@ -165,13 +182,11 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
// those parameters already exist in trt, and should not have another copy
// in fluid.
std::vector<std::string> repetitive_params;
std::vector<std::string> engine_names;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) {
CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove(
framework::ir::Agent(node).subgraph()->begin(),
framework::ir::Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
engine_names.push_back(CreateTensorRTOp(
node, graph, graph_param_names, &repetitive_params, use_cuda_graph));
}
}

@@ -184,13 +199,40 @@
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params));

bool all_nodes_offload_to_trt = AllNodesLowerToTrtPostProcess(graph);
if (all_nodes_offload_to_trt) {
LOG(INFO) << "The entire graph is offloaded to TensorRT.";
}
if (use_cuda_graph && !all_nodes_offload_to_trt) {
LOG_FIRST_N(WARNING, 1)
<< "You have enabled CudaGraph, but not the entire graph offload to "
"trt, now return to normal mode.";
use_cuda_graph = false;
}
if (use_cuda_graph && all_nodes_offload_to_trt) {
for (auto &name : engine_names) {
PADDLE_ENFORCE_EQ(
paddle::inference::Singleton<
inference::tensorrt::TRTEngineManager>::Global()
.Has(name),
true,
platform::errors::PreconditionNotMet(
"TRTEnegineManager shoud has engine %s, but not found.", name));
paddle::inference::Singleton<
inference::tensorrt::TRTEngineManager>::Global()
.Get(name)
->SetAllNodesLowerToTrt(use_cuda_graph);
}
}
}

std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
const std::set<std::string> &engine_outputs,
const std::string &predictor_id,
const std::string &max_batch_size,
const std::string &precision,
bool use_cuda_graph,
const bool for_calibration) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
@@ -209,17 +251,21 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
engine_hash_key += "#";
engine_hash_key += precision;

engine_hash_key += "#";
engine_hash_key += use_cuda_graph ? "true" : "false";

auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
VLOG(2) << "TRT engine hash key: " << engine_hash_key;
VLOG(2) << "TRT engine key: " << engine_key;
return engine_key;
}
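
For illustration only, here is a minimal standalone sketch (with a hypothetical signature string and helper name) of the idea behind folding use_cuda_graph into the engine hash key: the same subgraph signature yields different keys with and without CUDA graph, so the two engine variants never collide in the TRTEngineManager cache.

#include <functional>
#include <iostream>
#include <string>

// Hypothetical stand-in for GenerateEngineKey: append the flag as text,
// then hash the whole key string.
std::string MakeKey(const std::string &signature, bool use_cuda_graph) {
  std::string hash_key = signature;
  hash_key += "#";
  hash_key += use_cuda_graph ? "true" : "false";
  return std::to_string(std::hash<std::string>()(hash_key));
}

int main() {
  // Hypothetical "inputs#outputs#predictor_id#max_batch#precision" signature.
  const std::string signature = "x0#y0#0#1#0";
  std::cout << MakeKey(signature, false) << "\n";  // key for a plain TRT engine
  std::cout << MakeKey(signature, true) << "\n";   // distinct key with CUDA graph enabled
  return 0;
}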

void TensorRtSubgraphPass::CreateTensorRTOp(
std::string TensorRtSubgraphPass::CreateTensorRTOp(
framework::ir::Node *node,
framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const {
std::vector<std::string> *repetitive_params,
bool use_cuda_graph) const {
auto *op_desc = node->Op();
auto &subgraph = *framework::ir::Agent(node).subgraph();
PADDLE_ENFORCE_EQ(subgraph.empty(),
@@ -506,13 +552,15 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
std::to_string(0),
std::to_string(max_batch_size),
std::to_string(static_cast<int>(precision_mode)),
use_cuda_graph,
false);
auto calibration_engine_key =
GenerateEngineKey(input_names_with_id,
output_names_with_id,
std::to_string(0),
std::to_string(max_batch_size),
std::to_string(static_cast<int>(precision_mode)),
use_cuda_graph,
true);
auto predictor_id = Get<int>("predictor_id");

@@ -547,7 +595,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
(enable_int8 && calibration_data.size() == 0 && use_calib_mode);
if (calibration_mode) {
// Calibration mode means the process of generating the int8 calibration
// table data.
return;
return calibration_engine_key;
}

std::copy(params_not_shared.begin(),
@@ -582,6 +630,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
"recommend using the same TRT version at runtime.";
}

std::unordered_set<const Node *> nodes2remove(
framework::ir::Agent(node).subgraph()->begin(),
framework::ir::Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);

// Setting disable_trt_plugin_fp16 to true means that the TRT plugin will not
// run in fp16.
// When running fp16, the output accuracy of the model will be affected,
@@ -628,7 +681,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
return;
return engine_key + std::to_string(predictor_id);
} catch (const std::exception &exp) {
LOG(WARNING)
<< "Fail to load TRT Optimized Info from "
@@ -643,7 +696,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// If with_dynamic_shape is configured, but min_input_shape is empty, create
// the trt engine at runtime instead of in the pass.
if (with_dynamic_shape && min_input_shape.empty()) {
return;
return engine_key + std::to_string(predictor_id);
}

// The following code will NOT run in the following situations:
@@ -676,6 +729,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
}

return engine_key + std::to_string(predictor_id);
}

} // namespace analysis
@@ -42,10 +42,11 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
void ApplyImpl(framework::ir::Graph *graph) const override;

private:
void CreateTensorRTOp(framework::ir::Node *x,
framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
std::string CreateTensorRTOp(framework::ir::Node *x,
framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params,
bool use_cuda_graph) const;
void CleanIntermediateOutputs(framework::ir::Node *node);
};

12 changes: 11 additions & 1 deletion paddle/fluid/inference/api/analysis_config.cc
@@ -16,6 +16,7 @@
#include <string>
#include <tuple>

#include "glog/logging.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
@@ -442,6 +443,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(trt_dla_core_);
CP_MEMBER(trt_use_static_engine_);
CP_MEMBER(trt_use_calib_mode_);
CP_MEMBER(trt_use_cuda_graph_);
CP_MEMBER(trt_use_varseqlen_);
CP_MEMBER(trt_with_interleaved_);
CP_MEMBER(tensorrt_transformer_posid_);
@@ -719,7 +721,8 @@ void AnalysisConfig::EnableTensorRtEngine(
int min_subgraph_size,
AnalysisConfig::Precision precision_mode,
bool use_static,
bool use_calib_mode) {
bool use_calib_mode,
bool use_cuda_graph) {
#ifdef PADDLE_WITH_TENSORRT
if (!use_gpu()) {
LOG(ERROR) << "To use TensorRT engine, please call EnableUseGpu() first";
@@ -733,6 +736,11 @@ void AnalysisConfig::EnableTensorRtEngine(
tensorrt_precision_mode_ = precision_mode;
trt_use_static_engine_ = use_static;
trt_use_calib_mode_ = use_calib_mode;
trt_use_cuda_graph_ = use_cuda_graph;
if (use_cuda_graph) {
LOG_FIRST_N(INFO, 1) << "You have enabled Trt Cuda Graph, you must ensure "
"that the input Shape remains unchanged.";
}

Update();
#else
@@ -1313,6 +1321,8 @@ std::string AnalysisConfig::Summary() {
trt_use_static_engine_ ? "true" : "false"});
os.InsertRow(
{"tensorrt_use_calib_mode", trt_use_calib_mode_ ? "true" : "false"});
os.InsertRow(
{"tensorrt_use_cuda_graph", trt_use_cuda_graph_ ? "true" : "false"});

// dynamic_shape
os.InsertRow({"tensorrt_enable_dynamic_shape",
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -1354,6 +1354,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetTensorRtDLACore(config_.trt_dla_core_);
argument_->SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
argument_->SetTensorRtUseCalibMode(config_.trt_use_calib_mode_);
argument_->SetTensorRtUseCudaGraph(config_.trt_use_cuda_graph_);
argument_->SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
argument_->SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path());
argument_->SetTensorRtAllowBuildAtRuntime(
7 changes: 6 additions & 1 deletion paddle/fluid/inference/api/paddle_analysis_config.h
@@ -586,14 +586,18 @@ struct PD_INFER_DECL AnalysisConfig {
/// \param use_static Serialize optimization information to disk for reusing.
/// \param use_calib_mode Use TRT int8 calibration(post training
/// quantization).
/// \param use_cuda_graph Use CUDA Graph to reduce the time consumed by
/// enqueue. Note that this option can only be enabled when the input shape
/// (including the batch dimension) is constant across runs.
///
///
void EnableTensorRtEngine(int64_t workspace_size = 1 << 30,
int max_batch_size = 1,
int min_subgraph_size = 3,
Precision precision = Precision::kFloat32,
bool use_static = false,
bool use_calib_mode = true);
bool use_calib_mode = true,
bool use_cuda_graph = false);
///
/// \brief A boolean state telling whether the TensorRT engine is used.
///
@@ -1114,6 +1118,7 @@ struct PD_INFER_DECL AnalysisConfig {
Precision tensorrt_precision_mode_{Precision::kFloat32};
bool trt_use_static_engine_{false};
bool trt_use_calib_mode_{true};
bool trt_use_cuda_graph_{false};
bool trt_use_varseqlen_{false};
bool trt_with_interleaved_{false};
std::string tensorrt_transformer_posid_{""};
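
As a usage illustration (not part of this diff), here is a minimal sketch of how a caller might enable the new flag through the public API declared above. The model directory and GPU settings are placeholders, and it assumes the whole graph can be lowered to TensorRT with fixed input shapes.

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config("./model_dir");  // hypothetical model directory
  config.EnableUseGpu(1000 /* memory_pool_init_size_mb */, 0 /* gpu id */);
  // The last argument enables the TRT CUDA graph path added in this PR;
  // input shapes (including the batch dimension) must stay constant.
  config.EnableTensorRtEngine(1 << 30 /* workspace_size */,
                              1 /* max_batch_size */,
                              3 /* min_subgraph_size */,
                              paddle_infer::PrecisionType::kFloat32,
                              false /* use_static */,
                              false /* use_calib_mode */,
                              true /* use_cuda_graph */);
  auto predictor = paddle_infer::CreatePredictor(config);
  // ... feed fixed-shape inputs and call predictor->Run() as usual ...
  return 0;
}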
53 changes: 51 additions & 2 deletions paddle/fluid/inference/tensorrt/engine.cc
@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/enforce.h"

namespace paddle {
namespace inference {
@@ -129,12 +130,60 @@ void TensorRTEngine::Execute(int batch_size,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
infer_context->setDeviceMemory(context_memory);
}

// TODO(wilber): Does cudaGraph conflict with memory sharing?
if (startup_with_cudagraph_ && !cudagraph_inited_) {
// Avoid capturing initialization calls by executing the enqueue function at
// least once before starting CUDA graph capture.
const auto ret = Enqueue(infer_context, buffers, batch_size, stream);
PADDLE_ENFORCE_EQ(
ret,
true,
phi::errors::PreconditionNotMet("Trt CudaGraph test run failed."));
cudaStreamSynchronize(stream);

cuda_graph_.BeginCapture(stream);
// The built TRT engine may contain operations that are not permitted under
// CUDA graph capture mode. When the stream is capturing, the call may
// return false if the current CUDA graph capture fails.
if (Enqueue(infer_context, buffers, batch_size, stream)) {
cuda_graph_.EndCapture(stream);
cudagraph_inited_ = true;
} else {
cuda_graph_.EndCaptureOnError(stream);
// Ensure any CUDA error has been cleaned up.
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
LOG(WARNING) << "The built TensorRT engine contains operations that are "
"not permitted under "
"CUDA graph capture mode. The specified UseCudaGraph "
"flag has been ignored. The inference will be "
"launched without using CUDA graph launch.";
cudagraph_inited_ = false;
}
startup_with_cudagraph_ = false;
}

Enqueue(infer_context, buffers, batch_size, stream);
}

bool TensorRTEngine::Enqueue(nvinfer1::IExecutionContext *context,
std::vector<void *> *buffers,
int batch_size,
cudaStream_t stream) {
if (cudagraph_inited_) {
VLOG(1) << "cuda_graph init success, so we will use cuda graph launch the "
"entire graph.";
return cuda_graph_.Launch(stream);
}
Review comment (Contributor) on lines +169 to +177: TODO: shape check and error reporting.

bool ret;
if (!with_dynamic_shape()) {
infer_context->enqueue(batch_size, buffers->data(), stream, nullptr);
ret = context->enqueue(batch_size, buffers->data(), stream, nullptr);
} else {
infer_context->enqueueV2(buffers->data(), stream, nullptr);
ret = context->enqueueV2(buffers->data(), stream, nullptr);
}
SetRuntimeBatch(batch_size);
return ret;
}
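
For context, a self-contained sketch of the raw CUDA runtime pattern that the cuda_graph_ helper above presumably wraps: one warm-up launch outside capture, a capture pass, instantiation, and replay on later iterations. The doWork kernel is a placeholder for the TRT enqueue, and the 5-argument cudaGraphInstantiate shown is the pre-CUDA-12 form (CUDA 12 uses (exec, graph, flags)).

#include <cstdio>
#include <cuda_runtime.h>

__global__ void doWork(float *data) {  // placeholder for the captured work
  data[threadIdx.x] *= 2.0f;
}

int main() {
  float *buf = nullptr;
  cudaStream_t stream;
  cudaMalloc(&buf, 256 * sizeof(float));
  cudaStreamCreate(&stream);

  // 1. Warm-up launch outside capture, mirroring the extra Enqueue() above.
  doWork<<<1, 256, 0, stream>>>(buf);
  cudaStreamSynchronize(stream);

  // 2. Capture the same launch sequence into a graph and instantiate it.
  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  doWork<<<1, 256, 0, stream>>>(buf);
  cudaStreamEndCapture(stream, &graph);
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);

  // 3. Replay the captured graph instead of re-enqueueing work each time.
  for (int i = 0; i < 10; ++i) {
    cudaGraphLaunch(graph_exec, stream);
  }
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
  cudaFree(buf);
  printf("done\n");
  return 0;
}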

void TensorRTEngine::FreezeNetwork() {