Support trt cuda graph. (#53406)
jiweibo authored May 9, 2023
1 parent 72cb09e commit ea0abf9
Showing 13 changed files with 245 additions and 20 deletions.
1 change: 1 addition & 0 deletions paddle/fluid/inference/analysis/argument.h
@@ -231,6 +231,7 @@ struct Argument {
TensorRtUseStaticEngine,
bool);
DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_cuda_graph, TensorRtUseCudaGraph, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_varseqlen, TensorRtUseOSS, bool);
DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool);
DECL_ARGUMENT_FIELD(tensorrt_transformer_posid,
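The single DECL_ARGUMENT_FIELD line above is what makes the new flag visible to the rest of the analysis pipeline. Judging from how the field is used later in this commit (argument->tensorrt_use_cuda_graph() in ir_pass_manager.cc and argument_->SetTensorRtUseCudaGraph(...) in analysis_predictor.cc), such a declaration is assumed to boil down to roughly a stored member plus a getter and a setter; the sketch below is only an illustration, not the real macro expansion in argument.h.

struct ArgumentSketch {
  // Rough, hypothetical equivalent of
  // DECL_ARGUMENT_FIELD(tensorrt_use_cuda_graph, TensorRtUseCudaGraph, bool):
  bool tensorrt_use_cuda_graph() const { return tensorrt_use_cuda_graph_; }
  void SetTensorRtUseCudaGraph(bool v) { tensorrt_use_cuda_graph_ = v; }

 private:
  bool tensorrt_use_cuda_graph_{false};
};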
2 changes: 2 additions & 0 deletions paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -165,6 +165,8 @@ void IRPassManager::CreatePasses(Argument *argument,
new AnalysisConfig::Precision(precision_mode));
pass->Set("context_memory_sharing",
new bool(argument->trt_engine_memory_sharing()));
pass->Set("use_cuda_graph",
new bool(argument->tensorrt_use_cuda_graph()));
bool use_static_engine = argument->tensorrt_use_static_engine();
bool model_from_memory = argument->model_from_memory();
std::string optim_cache_dir = argument->optim_cache_dir();
75 changes: 65 additions & 10 deletions paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -101,6 +101,22 @@ void OutputProcess(framework::ir::Graph *graph,
}
}

// Determine whether the whole graph offloads to TensorRT. If so, we can try to
// enable optimizations such as CUDA Graph.
bool AllNodesLowerToTrtPostProcess(framework::ir::Graph *graph) {
std::unordered_set<std::string> trt_nodes_set{
"feed", "fetch", "tensorrt_engine"};
bool all_nodes_offload_to_trt = true;
for (auto *node : graph->Nodes()) {
if (node->IsOp()) {
if (!trt_nodes_set.count(node->Op()->Type())) {
all_nodes_offload_to_trt = false;
break;
}
}
}
return all_nodes_offload_to_trt;
}
} // namespace

using framework::ir::Node;
@@ -124,6 +140,7 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(

auto enable_int8 = Get<bool>("enable_int8");
auto use_calib_mode = Get<bool>("use_calib_mode");
bool use_cuda_graph = Get<bool>("use_cuda_graph");
bool no_calib_int8 = enable_int8 && !(use_calib_mode);
auto trt_disabled_ops = Get<std::vector<std::string>>("trt_disabled_ops");
auto with_dynamic_shape = Get<bool>("with_dynamic_shape");
@@ -165,13 +182,11 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
  // those parameters already exist in trt, and should not have another copy in
// fluid.
std::vector<std::string> repetitive_params;
std::vector<std::string> engine_names;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) {
CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove(
framework::ir::Agent(node).subgraph()->begin(),
framework::ir::Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
engine_names.push_back(CreateTensorRTOp(
node, graph, graph_param_names, &repetitive_params, use_cuda_graph));
}
}

@@ -184,13 +199,40 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params));

bool all_nodes_offload_to_trt = AllNodesLowerToTrtPostProcess(graph);
if (all_nodes_offload_to_trt) {
LOG(INFO) << "The entire graph is offloaded to TensorRT.";
}
if (use_cuda_graph && !all_nodes_offload_to_trt) {
LOG_FIRST_N(WARNING, 1)
<< "You have enabled CudaGraph, but not the entire graph offload to "
"trt, now return to normal mode.";
use_cuda_graph = false;
}
if (use_cuda_graph && all_nodes_offload_to_trt) {
for (auto &name : engine_names) {
PADDLE_ENFORCE_EQ(
paddle::inference::Singleton<
inference::tensorrt::TRTEngineManager>::Global()
.Has(name),
true,
platform::errors::PreconditionNotMet(
"TRTEnegineManager shoud has engine %s, but not found.", name));
paddle::inference::Singleton<
inference::tensorrt::TRTEngineManager>::Global()
.Get(name)
->SetAllNodesLowerToTrt(use_cuda_graph);
}
}
}

std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
const std::set<std::string> &engine_outputs,
const std::string &predictor_id,
const std::string &max_batch_size,
const std::string &precision,
bool use_cuda_graph,
const bool for_calibration) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
@@ -209,17 +251,21 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
engine_hash_key += "#";
engine_hash_key += precision;

engine_hash_key += "#";
  engine_hash_key += use_cuda_graph ? "true" : "false";

auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
VLOG(2) << "TRT engine hash key: " << engine_hash_key;
VLOG(2) << "TRT engine key: " << engine_key;
return engine_key;
}

void TensorRtSubgraphPass::CreateTensorRTOp(
std::string TensorRtSubgraphPass::CreateTensorRTOp(
framework::ir::Node *node,
framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const {
std::vector<std::string> *repetitive_params,
bool use_cuda_graph) const {
auto *op_desc = node->Op();
auto &subgraph = *framework::ir::Agent(node).subgraph();
PADDLE_ENFORCE_EQ(subgraph.empty(),
@@ -506,13 +552,15 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
std::to_string(0),
std::to_string(max_batch_size),
std::to_string(static_cast<int>(precision_mode)),
use_cuda_graph,
false);
auto calibration_engine_key =
GenerateEngineKey(input_names_with_id,
output_names_with_id,
std::to_string(0),
std::to_string(max_batch_size),
std::to_string(static_cast<int>(precision_mode)),
use_cuda_graph,
true);
auto predictor_id = Get<int>("predictor_id");

@@ -547,7 +595,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
(enable_int8 && calibration_data.size() == 0 && use_calib_mode);
if (calibration_mode) {
    // Calibration mode means generating the int8 calibration table data.
return;
return calibration_engine_key;
}

std::copy(params_not_shared.begin(),
@@ -582,6 +630,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
"recommend using the same TRT version at runtime.";
}

std::unordered_set<const Node *> nodes2remove(
framework::ir::Agent(node).subgraph()->begin(),
framework::ir::Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);

  // Setting disable_trt_plugin_fp16 to true means that the TRT plugin will
  // not run in fp16.
  // When running in fp16, the output accuracy of the model will be affected,
@@ -628,7 +681,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
return;
return engine_key + std::to_string(predictor_id);
} catch (const std::exception &exp) {
LOG(WARNING)
<< "Fail to load TRT Optimized Info from "
Expand All @@ -643,7 +696,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// If with_dynamic_shape is configured,but min_input_shape is empty,
// create trt engine in runtime instead of in pass.
if (with_dynamic_shape && min_input_shape.empty()) {
return;
return engine_key + std::to_string(predictor_id);
}

// the following code will NOT run in following situation:
Expand Down Expand Up @@ -676,6 +729,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
}

return engine_key + std::to_string(predictor_id);
}

} // namespace analysis
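GenerateEngineKey now folds use_cuda_graph into the string that gets hashed, so an engine built with CUDA Graph enabled can never collide with a cached or serialized engine built without it. A minimal standalone sketch of that idea (the key material below is hypothetical, not Paddle's exact format):

#include <functional>
#include <iostream>
#include <string>

int main() {
  // Hypothetical key material: inputs, outputs, predictor id, max batch size
  // and precision joined with '#', as in GenerateEngineKey above.
  std::string base = "x#y#0#1#0";
  // Appending the flag changes the hash, and therefore the engine key.
  auto with_graph = std::hash<std::string>()(base + "#1");
  auto without_graph = std::hash<std::string>()(base + "#0");
  std::cout << "with cuda graph:    " << with_graph << "\n";
  std::cout << "without cuda graph: " << without_graph << "\n";
  return 0;
}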
@@ -42,10 +42,11 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
void ApplyImpl(framework::ir::Graph *graph) const override;

private:
void CreateTensorRTOp(framework::ir::Node *x,
framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
std::string CreateTensorRTOp(framework::ir::Node *x,
framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params,
bool use_cuda_graph) const;
void CleanIntermediateOutputs(framework::ir::Node *node);
};

12 changes: 11 additions & 1 deletion paddle/fluid/inference/api/analysis_config.cc
@@ -16,6 +16,7 @@
#include <string>
#include <tuple>

#include "glog/logging.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
@@ -442,6 +443,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(trt_dla_core_);
CP_MEMBER(trt_use_static_engine_);
CP_MEMBER(trt_use_calib_mode_);
CP_MEMBER(trt_use_cuda_graph_);
CP_MEMBER(trt_use_varseqlen_);
CP_MEMBER(trt_with_interleaved_);
CP_MEMBER(tensorrt_transformer_posid_);
@@ -719,7 +721,8 @@ void AnalysisConfig::EnableTensorRtEngine(
int min_subgraph_size,
AnalysisConfig::Precision precision_mode,
bool use_static,
bool use_calib_mode) {
bool use_calib_mode,
bool use_cuda_graph) {
#ifdef PADDLE_WITH_TENSORRT
if (!use_gpu()) {
LOG(ERROR) << "To use TensorRT engine, please call EnableUseGpu() first";
@@ -733,6 +736,11 @@ void AnalysisConfig::EnableTensorRtEngine(
tensorrt_precision_mode_ = precision_mode;
trt_use_static_engine_ = use_static;
trt_use_calib_mode_ = use_calib_mode;
trt_use_cuda_graph_ = use_cuda_graph;
if (use_cuda_graph) {
    LOG_FIRST_N(INFO, 1) << "You have enabled TRT CUDA Graph; you must ensure "
                            "that the input shape remains unchanged.";
}

Update();
#else
@@ -1313,6 +1321,8 @@ std::string AnalysisConfig::Summary() {
trt_use_static_engine_ ? "true" : "false"});
os.InsertRow(
{"tensorrt_use_calib_mode", trt_use_calib_mode_ ? "true" : "false"});
os.InsertRow(
{"tensorrt_use_cuda_graph", trt_use_cuda_graph_ ? "true" : "false"});

// dynamic_shape
os.InsertRow({"tensorrt_enable_dynamic_shape",
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -1352,6 +1352,7 @@ void AnalysisPredictor::PrepareArgument() {
argument_->SetTensorRtDLACore(config_.trt_dla_core_);
argument_->SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
argument_->SetTensorRtUseCalibMode(config_.trt_use_calib_mode_);
argument_->SetTensorRtUseCudaGraph(config_.trt_use_cuda_graph_);
argument_->SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
argument_->SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path());
argument_->SetTensorRtAllowBuildAtRuntime(
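Taken together, the plumbing for the new option is a straight line from the user-facing config down to the subgraph pass; each call below is quoted from the hunks in this commit:

// AnalysisConfig (analysis_config.cc): remember the user's choice.
trt_use_cuda_graph_ = use_cuda_graph;
// AnalysisPredictor (analysis_predictor.cc): copy it into the Argument.
argument_->SetTensorRtUseCudaGraph(config_.trt_use_cuda_graph_);
// IRPassManager (ir_pass_manager.cc): expose it to the pass as an attribute.
pass->Set("use_cuda_graph", new bool(argument->tensorrt_use_cuda_graph()));
// TensorRtSubgraphPass (tensorrt_subgraph_pass.cc): read it back.
bool use_cuda_graph = Get<bool>("use_cuda_graph");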
7 changes: 6 additions & 1 deletion paddle/fluid/inference/api/paddle_analysis_config.h
@@ -586,14 +586,18 @@ struct PD_INFER_DECL AnalysisConfig {
/// \param use_static Serialize optimization information to disk for reusing.
/// \param use_calib_mode Use TRT int8 calibration(post training
/// quantization).
/// \param use_cuda_graph Use CUDA Graph to reduce the time spent on
/// enqueue. Note that this option can only be enabled when your input
/// shape is constant (including the batch dimension).
///
///
void EnableTensorRtEngine(int64_t workspace_size = 1 << 30,
int max_batch_size = 1,
int min_subgraph_size = 3,
Precision precision = Precision::kFloat32,
bool use_static = false,
bool use_calib_mode = true);
bool use_calib_mode = true,
bool use_cuda_graph = false);
///
/// \brief A boolean state telling whether the TensorRT engine is used.
///
@@ -1114,6 +1118,7 @@ struct PD_INFER_DECL AnalysisConfig {
Precision tensorrt_precision_mode_{Precision::kFloat32};
bool trt_use_static_engine_{false};
bool trt_use_calib_mode_{true};
bool trt_use_cuda_graph_{false};
bool trt_use_varseqlen_{false};
bool trt_with_interleaved_{false};
std::string tensorrt_transformer_posid_{""};
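For reference, enabling the feature from user code only requires the new trailing argument; a minimal sketch (the model paths and sizes are placeholders, and per the doc comment above the input shape, batch dimension included, must stay fixed across runs):

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

void BuildConfig(paddle::AnalysisConfig *config) {
  config->SetModel("model.pdmodel", "model.pdiparams");  // placeholder paths
  config->EnableUseGpu(/*memory_pool_init_size_mb=*/100, /*device_id=*/0);
  config->EnableTensorRtEngine(/*workspace_size=*/1 << 30,
                               /*max_batch_size=*/1,
                               /*min_subgraph_size=*/3,
                               paddle::AnalysisConfig::Precision::kFloat32,
                               /*use_static=*/false,
                               /*use_calib_mode=*/false,
                               /*use_cuda_graph=*/true);
}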
53 changes: 51 additions & 2 deletions paddle/fluid/inference/tensorrt/engine.cc
@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/enforce.h"

namespace paddle {
namespace inference {
@@ -129,12 +130,60 @@ void TensorRTEngine::Execute(int batch_size,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
infer_context->setDeviceMemory(context_memory);
}

  // TODO(wilber): Does cudaGraph conflict with memory sharing?
if (startup_with_cudagraph_ && !cudagraph_inited_) {
// Avoid capturing initialization calls by executing the enqueue function at
// least once before starting CUDA graph capture.
const auto ret = Enqueue(infer_context, buffers, batch_size, stream);
PADDLE_ENFORCE_EQ(
ret,
true,
phi::errors::PreconditionNotMet("Trt CudaGraph test run failed."));
cudaStreamSynchronize(stream);

cuda_graph_.BeginCapture(stream);
// The built TRT engine may contain operations that are not permitted under
// CUDA graph capture mode. When the stream is capturing, the call may
// return false if the current CUDA graph capture fails.
if (Enqueue(infer_context, buffers, batch_size, stream)) {
cuda_graph_.EndCapture(stream);
cudagraph_inited_ = true;
} else {
cuda_graph_.EndCaptureOnError(stream);
// Ensure any CUDA error has been cleaned up.
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetLastError());
LOG(WARNING) << "The built TensorRT engine contains operations that are "
"not permitted under "
"CUDA graph capture mode. The specified UseCudaGraph "
"flag has been ignored. The inference will be "
"launched without using CUDA graph launch.";
cudagraph_inited_ = false;
}
startup_with_cudagraph_ = false;
}

Enqueue(infer_context, buffers, batch_size, stream);
}

bool TensorRTEngine::Enqueue(nvinfer1::IExecutionContext *context,
std::vector<void *> *buffers,
int batch_size,
cudaStream_t stream) {
if (cudagraph_inited_) {
VLOG(1) << "cuda_graph init success, so we will use cuda graph launch the "
"entire graph.";
return cuda_graph_.Launch(stream);
}

bool ret;
if (!with_dynamic_shape()) {
infer_context->enqueue(batch_size, buffers->data(), stream, nullptr);
ret = context->enqueue(batch_size, buffers->data(), stream, nullptr);
} else {
infer_context->enqueueV2(buffers->data(), stream, nullptr);
ret = context->enqueueV2(buffers->data(), stream, nullptr);
}
SetRuntimeBatch(batch_size);
return ret;
}

void TensorRTEngine::FreezeNetwork() {
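The capture sequence in Execute() above — one warm-up enqueue, then capture, then replay on later calls — is the standard CUDA Graph stream-capture pattern. The self-contained sketch below shows that pattern with the raw CUDA runtime API, using cudaMemsetAsync as a stand-in for the TRT enqueue; it illustrates what a helper like cuda_graph_ typically wraps and is not Paddle's implementation.

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  void *buf = nullptr;
  cudaMalloc(&buf, 1 << 20);

  // 1) Warm-up launch outside capture, so one-time init work is not recorded.
  cudaMemsetAsync(buf, 0, 1 << 20, stream);
  cudaStreamSynchronize(stream);

  // 2) Capture the per-iteration work once.
  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  cudaMemsetAsync(buf, 0, 1 << 20, stream);  // stand-in for enqueueV2()
  cudaStreamEndCapture(stream, &graph);
  // CUDA 11-style signature; CUDA 12 drops the error-node/log-buffer params.
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);

  // 3) Replay the captured graph on every subsequent "inference" call.
  for (int i = 0; i < 3; ++i) {
    cudaGraphLaunch(graph_exec, stream);
  }
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
  cudaFree(buf);
  cudaStreamDestroy(stream);
  printf("done: %s\n", cudaGetErrorString(cudaGetLastError()));
  return 0;
}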