From 51462f7dcadcfe94ceaf6fcead2943a84bb9b19a Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 21 Dec 2021 04:57:44 +0000
Subject: [PATCH 01/12] compile done

---
 paddle/fluid/framework/naive_executor.cc      | 35 +++++++++++
 paddle/fluid/framework/naive_executor.h       |  2 +
 .../fluid/inference/api/analysis_predictor.h  |  1 +
 paddle/fluid/inference/tensorrt/engine.cc     | 60 ++++++++++---------
 paddle/fluid/inference/tensorrt/engine.h      | 30 +++++++++-
 .../operators/tensorrt/tensorrt_engine_op.h   | 56 ++++++++++-------
 6 files changed, 134 insertions(+), 50 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 7d55d8c41e3e9..e7dc4eab74a2f 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -20,6 +20,9 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
+#if PADDLE_WITH_TENSORRT
+#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -132,5 +135,37 @@ NaiveExecutor::~NaiveExecutor() {
 #endif
 }
 
+void NaiveExecutor::ResetTrtOps(int num) {
+#if PADDLE_WITH_TENSORRT
+  for (auto &op : ops_) {
+    if (op->Type() == "tensorrt_engine") {
+      operators::TensorRTEngineOp *trtop =
+          dynamic_cast<operators::TensorRTEngineOp *>(op.get());
+      if (!trtop) return;
+      std::string engine_key = trtop->Attr<std::string>("engine_key");
+      int engine_predictor_id = trtop->Attr<int>("predictor_id");
+      std::string engine_name =
+          engine_key + std::to_string(engine_predictor_id);
+      operators::TensorRTEngine *trt_engine =
+          paddle::inference::Singleton<
+              inference::tensorrt::TRTEngineManager>::Global()
+              .Get(engine_name);
+      if (trt_engine->with_dynamic_shape()) {
+        trt_engine->ResetContext();
+        trt_engine->ClearTensorMap();
+        trt_engine->SetProfileNum(num);
+        auto *anc = scope_->parent();
+        while (anc && anc->parent()) {
+          anc = anc->parent();
+        }
+        if (anc == nullptr) {
+          anc = scope_;
+        }
+        trtop->PrepareTRTEngine(*anc, trt_engine);
+      }
+    }
+  }
+#endif
+}
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index f38632a9a639c..ed475e66f626d 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -63,6 +63,8 @@ class NaiveExecutor {
 
   void CleanFeedFetchOps();
 
+  void ResetTrtOps(int num);
+
  protected:
   void CreateOps(const ProgramDesc& desc, int block_id,
                  bool with_feed_fetch_ops);
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 9c36051757527..4c7b8e081300d 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -429,6 +429,7 @@ class AnalysisPredictor : public PaddlePredictor {
   bool status_is_cloned_{false};
   std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
+  int clone_num{1};
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index cb815e00c4430..669eb165614a7 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -42,7 +42,10 @@ void TensorRTEngine::InitNetwork() {
   }
 
   infer_builder_config_.reset(infer_builder_->createBuilderConfig());
-  optim_profile_ = infer_builder_->createOptimizationProfile();
+  // optim_profile_ = infer_builder_->createOptimizationProfile();
+  optim_profiles_.resize(max_profile_num_);
+  for (int i = 0; i < max_profile_num_; i++)
+    optim_profiles_[i] = infer_builder_->createOptimizationProfile();
 }
 
 void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
@@ -199,35 +202,37 @@ void TensorRTEngine::FreezeNetwork() {
   if (with_dynamic_shape_) {
 #if IS_TRT_VERSION_GE(6000)
     LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode.";
-    for (auto &input : min_input_shape_) {
+    for (int i = 0; i < max_profile_num_; i++) {
+      for (auto &input : min_input_shape_) {
 #if IS_TRT_VERSION_LT(7000)
-      // trt6 will check all_of input > 0
-      if (!(std::all_of(input.second.begin(), input.second.end(),
-                        [](int x) { return x > 0; }) &&
-            std::all_of(max_input_shape_[input.first].begin(),
-                        max_input_shape_[input.first].end(),
-                        [](int x) { return x > 0; }) &&
-            std::all_of(optim_input_shape_[input.first].begin(),
-                        optim_input_shape_[input.first].end(),
-                        [](int x) { return x > 0; }))) {
-        continue;
-      }
+        // trt6 will check all_of input > 0
+        if (!(std::all_of(input.second.begin(), input.second.end(),
+                          [](int x) { return x > 0; }) &&
+              std::all_of(max_input_shape_[input.first].begin(),
+                          max_input_shape_[input.first].end(),
+                          [](int x) { return x > 0; }) &&
+              std::all_of(optim_input_shape_[input.first].begin(),
+                          optim_input_shape_[input.first].end(),
+                          [](int x) { return x > 0; }))) {
+          continue;
+        }
 #endif
-      VLOG(4) << "TRT dynamic_shape set " << input.first
-              << " min: " << Vec2Str(input.second)
-              << ", max: " << Vec2Str(max_input_shape_[input.first])
-              << ", opt: " << Vec2Str(optim_input_shape_[input.first]);
-      optim_profile_->setDimensions(
-          input.first.c_str(), nvinfer1::OptProfileSelector::kMIN,
-          Vec2TRT_Dims(input.second, input.first, true));
-      optim_profile_->setDimensions(
-          input.first.c_str(), nvinfer1::OptProfileSelector::kMAX,
-          Vec2TRT_Dims(max_input_shape_[input.first], input.first, true));
-      optim_profile_->setDimensions(
-          input.first.c_str(), nvinfer1::OptProfileSelector::kOPT,
-          Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
+        VLOG(4) << "TRT dynamic_shape set " << input.first
+                << " min: " << Vec2Str(input.second)
+                << ", max: " << Vec2Str(max_input_shape_[input.first])
+                << ", opt: " << Vec2Str(optim_input_shape_[input.first]);
+        optim_profiles_[i]->setDimensions(
+            input.first.c_str(), nvinfer1::OptProfileSelector::kMIN,
+            Vec2TRT_Dims(input.second, input.first, true));
+        optim_profiles_[i]->setDimensions(
+            input.first.c_str(), nvinfer1::OptProfileSelector::kMAX,
+            Vec2TRT_Dims(max_input_shape_[input.first], input.first, true));
+        optim_profiles_[i]->setDimensions(
+            input.first.c_str(), nvinfer1::OptProfileSelector::kOPT,
+            Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
+      }
+      infer_builder_config_->addOptimizationProfile(optim_profiles_[i]);
     }
-    infer_builder_config_->addOptimizationProfile(optim_profile_);
     if (WithFp16() && disable_trt_plugin_fp16()) {
       LOG(INFO) << "NOTE: In order to achieve higher accuracy, you have "
                    "disabled the fp16 mode of TRT Plugin,\n"
@@ -236,6 +241,7 @@
                    "opt_shape, false /*disable_trt_plugin_fp16*/)'";
     }
 #endif
+    binding_num_ = infer_engine_->getNbBindings();
   }
 
 #if IS_TRT_VERSION_GE(8200)
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index e6f58c8c8e8f4..5203d5699150d 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -254,9 +254,31 @@ class TensorRTEngine {
         platform::errors::InvalidArgument(
             "You should build engine first and then set the context."));
       infer_context_[tid].reset(infer_engine_->createExecutionContext());
+      if (with_dynamic_shape_) {
+        // need new profile if it's not the first
+        if (profile_num_ > 0) {
+          infer_context_[tid]->setOptimizationProfile(profile_num_);
+        }
+        profile_index_[tid] = profile_num_;
+        ++profile_num_;
+      }
     }
     return infer_context_[tid].get();
   }
+
+  int GetProfileIndex() {
+    // re-think!!!
+    std::unique_lock<std::mutex> lock(mutex_);
+    const std::thread::id tid = std::this_thread::get_id();
+    return profile_index_[tid];
+  }
+
+  int GetBindingsOffset() {
+    return (binding_num_ / max_profile_num_) * GetProfileIndex();
+  }
+
+  int GetNbBindings() { return binding_num_; }
+
   void ResetContext() {
     std::unique_lock<std::mutex> lock(mutex_);
     const std::thread::id tid = std::this_thread::get_id();
@@ -322,6 +344,7 @@
                    "generating serialization file and doing inference are "
                    "consistent."));
 
+    binding_num_ = infer_engine_->getNbBindings();
     GetEngineInfo();
   }
@@ -540,6 +563,7 @@
     }
   }
 
+  void SetProfileNum(int num) { max_profile_num_ = num; }
   void GetEngineInfo() {
 #if IS_TRT_VERSION_GE(8200)
     std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
@@ -571,6 +595,9 @@
   int batch_size_{-1};
 
   int device_id_;
+  int max_profile_num_{1};
+  int profile_num_{0};
+  std::unordered_map<std::thread::id, int> profile_index_;
   ShapeMapType min_input_shape_;
   ShapeMapType max_input_shape_;
   ShapeMapType optim_input_shape_;
@@ -614,8 +641,9 @@
   // For dynamic shape
   bool with_dynamic_shape_{false};
 #if IS_TRT_VERSION_GE(6000)
+  int binding_num_;
   infer_ptr<nvinfer1::IBuilderConfig> infer_builder_config_;
-  nvinfer1::IOptimizationProfile* optim_profile_;
+  std::vector<nvinfer1::IOptimizationProfile*> optim_profiles_;
   std::vector<std::unique_ptr<plugin::DynamicPluginTensorRT>> owned_pluginv2_;
 #endif
   std::mutex mutex_;
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 35612905f8569..e4687b0cca9c9 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -250,6 +250,23 @@ class TensorRTEngineOp : public framework::OperatorBase {
     }
   }
 
+  void PrepareTRTEngine(const framework::Scope &scope,
+                        TensorRTEngine *engine) const {
+    LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
+                 "kernel etc). This process may cost a lot of time.";
+    framework::proto::BlockDesc block_proto;
+    block_proto.ParseFromString(Attr<std::string>("subgraph"));
+    framework::BlockDesc block_desc(nullptr, &block_proto);
+
+    std::vector<std::string> inputs = Inputs("Xs");
+    std::vector<std::string> outputs =
+        Attr<std::vector<std::string>>("output_name_mapping");
+
+    inference::Singleton<inference::tensorrt::OpConverter>::Global()
+        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
+                                 outputs, engine);
+  }
+
  protected:
   void RunNativeImpl(const framework::Scope &scope,
                      const platform::Place &dev_place) const {
@@ -414,8 +431,19 @@
     int num_inputs = 0;
 
     num_inputs += runtime_input_names_.size();
-    const int num_bindings = num_inputs + Outputs("Ys").size();
-    std::vector<void *> buffers(num_bindings);
+    // const int num_bindings = num_inputs + Outputs("Ys").size();
+    // std::vector<void *> buffers(num_bindings);
+    // This method returns the total over all profiles.
+    const int num_bindings = engine->GetNbBindings();
+    std::vector<void *> buffers(num_bindings, nullptr);
+
+    int binding_offset = 0;
+    nvinfer1::IExecutionContext *trt_context = nullptr;
+    if (engine->with_dynamic_shape()) {
+      // Initialize context and get offset by profile index
+      trt_context = engine->context();
+      binding_offset = engine->GetBindingsOffset();
+    }
 
     // Bind input tensor to TRT.
     for (const auto &x : runtime_input_names_) {
@@ -430,7 +458,10 @@
         t.ShareDataWith(out);
       }
       auto t_shape = framework::vectorize(t.dims());
-      const int bind_index = engine->engine()->getBindingIndex(x.c_str());
+      // const int bind_index = engine->engine()->getBindingIndex(x.c_str());
+      // Get the profile-0 index first, then add the binding offset
+      const int bind_index =
+          engine->engine()->getBindingIndex(x.c_str()) + binding_offset;
       PADDLE_ENFORCE_LT(
           bind_index, num_bindings,
           platform::errors::InvalidArgument(
@@ -474,7 +505,6 @@
         }
       } else {
 #if IS_TRT_VERSION_GE(6000)
-        auto *trt_context = engine->context();
         trt_context->setBindingDimensions(
             bind_index, inference::tensorrt::Vec2TRT_Dims(t_shape, x, true));
 #endif
@@ -511,7 +541,6 @@
       }
     } else {
 #if IS_TRT_VERSION_GE(6000)
-      auto *trt_context = engine->context();
       auto dims = trt_context->getBindingDimensions(bind_index);
       int nb_dims = dims.nbDims;
       for (; nb_dims > 0; nb_dims--) {
@@ -583,23 +612,6 @@
     }
     return trt_engine_;
   }
-
-  void PrepareTRTEngine(const framework::Scope &scope,
-                        TensorRTEngine *engine) const {
-    LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
-                 "kernel etc). This process may cost a lot of time.";
-    framework::proto::BlockDesc block_proto;
-    block_proto.ParseFromString(Attr<std::string>("subgraph"));
-    framework::BlockDesc block_desc(nullptr, &block_proto);
-
-    std::vector<std::string> inputs = Inputs("Xs");
-    std::vector<std::string> outputs =
-        Attr<std::vector<std::string>>("output_name_mapping");
-
-    inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
-                                 outputs, engine);
-  }
 };
 
 }  // namespace operators
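
Patch 01 is the heart of the series: InitNetwork() now builds one TensorRT optimization profile per expected predictor, FreezeNetwork() registers every profile's min/opt/max shape ranges with the builder config, and context() pins each thread's execution context to its own profile. For readers less familiar with the multi-profile API, here is a stand-alone sketch of what FreezeNetwork() is now doing, written against the raw TensorRT API; the input name "image" and the dims are illustrative only, borrowed from the unit test added in patch 11:

    #include "NvInfer.h"

    // One IOptimizationProfile per predictor clone, all registered with the
    // same builder config before the engine is built.
    void AddProfiles(nvinfer1::IBuilder *builder,
                     nvinfer1::IBuilderConfig *config, int num_profiles) {
      for (int i = 0; i < num_profiles; ++i) {
        nvinfer1::IOptimizationProfile *profile =
            builder->createOptimizationProfile();
        // This sketch gives every profile the same shape range; distinct
        // ranges per profile would be equally legal.
        profile->setDimensions("image", nvinfer1::OptProfileSelector::kMIN,
                               nvinfer1::Dims4{1, 1, 3, 3});
        profile->setDimensions("image", nvinfer1::OptProfileSelector::kOPT,
                               nvinfer1::Dims4{1, 1, 3, 3});
        profile->setDimensions("image", nvinfer1::OptProfileSelector::kMAX,
                               nvinfer1::Dims4{1, 1, 10, 10});
        config->addOptimizationProfile(profile);
      }
    }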
From e8914794ed490527ef1c60642ba5274febf0d684 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Wed, 22 Dec 2021 12:00:55 +0000
Subject: [PATCH 02/12] trt

---
 paddle/fluid/framework/naive_executor.cc         | 1 +
 paddle/fluid/inference/api/analysis_predictor.cc | 1 +
 paddle/fluid/inference/api/analysis_predictor.h  | 2 +-
 paddle/fluid/inference/tensorrt/engine.cc        | 5 +++--
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index e7dc4eab74a2f..3d15fe4382719 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -151,6 +151,7 @@ void NaiveExecutor::ResetTrtOps(int num) {
               inference::tensorrt::TRTEngineManager>::Global()
               .Get(engine_name);
       if (trt_engine->with_dynamic_shape()) {
+        LOG(INFO) << "rebuild trt engine!";
         trt_engine->ResetContext();
         trt_engine->ClearTensorMap();
         trt_engine->SetProfileNum(num);
         auto *anc = scope_->parent();
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 5d5719533e7a7..acf1d54afe756 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1332,6 +1332,7 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
   std::lock_guard<std::mutex> lk(clone_mutex_);
   auto *x = new AnalysisPredictor(config_);
   x->Init(scope_, inference_program_);
+  x->executor_->ResetTrtOps(++x->clone_num_);
   return std::unique_ptr<PaddlePredictor>(x);
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 4c7b8e081300d..8edd459202897 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -429,7 +429,7 @@ class AnalysisPredictor : public PaddlePredictor {
   bool status_is_cloned_{false};
   std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
-  int clone_num{1};
+  int clone_num_{1};
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 669eb165614a7..0e5032f0f935c 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -221,6 +221,7 @@ void TensorRTEngine::FreezeNetwork() {
                 << " min: " << Vec2Str(input.second)
                 << ", max: " << Vec2Str(max_input_shape_[input.first])
                 << ", opt: " << Vec2Str(optim_input_shape_[input.first]);
+
         optim_profiles_[i]->setDimensions(
             input.first.c_str(), nvinfer1::OptProfileSelector::kMIN,
             Vec2TRT_Dims(input.second, input.first, true));
@@ -241,9 +242,7 @@
         "opt_shape, false /*disable_trt_plugin_fp16*/)'";
     }
 #endif
-    binding_num_ = infer_engine_->getNbBindings();
   }
-
 #if IS_TRT_VERSION_GE(8200)
   infer_builder_config_->setProfilingVerbosity(
       nvinfer1::ProfilingVerbosity::kDETAILED);
@@ -266,6 +265,8 @@
           "Build TensorRT cuda engine failed! Please recheck "
           "you configurations related to paddle-TensorRT."));
 
+  binding_num_ = infer_engine_->getNbBindings();
+
   GetEngineInfo();
 }
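
Patch 02 hooks the rebuild into AnalysisPredictor::Clone() via ResetTrtOps(++clone_num_) and records binding_num_ once the engine has actually been built. The bookkeeping relies on TensorRT's binding layout: an engine with K profiles exposes K equal-sized groups of bindings, and getBindingIndex() always answers with the profile-0 index. The arithmetic behind GetBindingsOffset(), as a self-contained sketch with illustrative numbers:

    // An engine with 6 total bindings and 3 profiles owns 6 / 3 = 2 bindings
    // per profile, so profile 2's group starts at binding index 4.
    int BindingOffset(int total_bindings, int num_profiles, int profile_index) {
      const int bindings_per_profile = total_bindings / num_profiles;
      return bindings_per_profile * profile_index;
    }
    // A tensor whose profile-0 binding index is i is addressed as
    // i + BindingOffset(total, profiles, k) when profile k is selected.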
From bb0283498ce7c14617a7963b09e6f99c8131c9a1 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Mon, 27 Dec 2021 13:11:46 +0000
Subject: [PATCH 03/12] tensor fix

---
 paddle/fluid/framework/naive_executor.cc                 | 2 +-
 .../analysis/passes/ir_params_sync_among_devices_pass.cc | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 3d15fe4382719..9bd6aba3ea842 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -151,7 +151,7 @@ void NaiveExecutor::ResetTrtOps(int num) {
               inference::tensorrt::TRTEngineManager>::Global()
               .Get(engine_name);
       if (trt_engine->with_dynamic_shape()) {
-        LOG(INFO) << "rebuild trt engine!";
+        LOG(INFO) << "rebuild trt engine, this may cost a lot of time!";
         trt_engine->ResetContext();
         trt_engine->ClearTensorMap();
         trt_engine->SetProfileNum(num);
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 9993bb37d5140..8be4c2f521242 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -56,8 +56,11 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // Because there exists the case that new parameter variables are not added to
   // the program in the analysis pass.
   bool reserve_cpu_weights = false;
-  if (argument->tensorrt_allow_build_at_runtime_valid() &&
-      argument->tensorrt_allow_build_at_runtime()) {
+  bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
+                             argument->min_input_shape().size() > 0 &&
+                             argument->optim_input_shape().size() > 0) ||
+                            argument->tensorrt_tuned_dynamic_shape();
+  if (with_dynamic_shape) {
     reserve_cpu_weights = true;
   }
   for (auto &var_name : all_vars) {

From 6d798208ba67d1c5a903193c5d0b307825def2da Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 04:20:53 +0000
Subject: [PATCH 04/12] rebuild trt

---
 paddle/fluid/inference/tensorrt/engine.cc |  5 +++++
 paddle/fluid/inference/tensorrt/engine.h  | 10 +++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 0e5032f0f935c..2a35f497ed07f 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -266,6 +266,11 @@ void TensorRTEngine::FreezeNetwork() {
           "you configurations related to paddle-TensorRT."));
 
   binding_num_ = infer_engine_->getNbBindings();
+  // reset status for dynamic shape clone
+  if (max_profile_num_ > 1) {
+    infer_context_.clear();
+    cur_profile_num_ = 0;
+  }
 
   GetEngineInfo();
 }
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 5203d5699150d..5b1f52372ce76 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -256,11 +256,11 @@ class TensorRTEngine {
       infer_context_[tid].reset(infer_engine_->createExecutionContext());
       if (with_dynamic_shape_) {
         // need new profile if it's not the first
-        if (profile_num_ > 0) {
-          infer_context_[tid]->setOptimizationProfile(profile_num_);
+        if (cur_profile_num_ > 0) {
+          infer_context_[tid]->setOptimizationProfile(cur_profile_num_);
         }
-        profile_index_[tid] = profile_num_;
-        ++profile_num_;
+        profile_index_[tid] = cur_profile_num_;
+        ++cur_profile_num_;
       }
     }
     return infer_context_[tid].get();
@@ -596,7 +596,7 @@ class TensorRTEngine {
   int device_id_;
 
   int max_profile_num_{1};
-  int profile_num_{0};
+  int cur_profile_num_{0};
   std::unordered_map<std::thread::id, int> profile_index_;
   ShapeMapType min_input_shape_;
   ShapeMapType max_input_shape_;
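
Patches 03 and 04 make the rebuild safe to repeat: parameters stay on the CPU whenever dynamic shape is active, so the subgraph can be re-converted after Clone(), and FreezeNetwork() now clears the cached per-thread contexts of a rebuilt multi-profile engine. End to end, the workflow this enables looks roughly like the following sketch; the model path, input name, and shapes are placeholders, and the pattern mirrors the unit test added in patch 11:

    #include <map>
    #include <string>
    #include <thread>
    #include <vector>
    #include "paddle_inference_api.h"  // header name assumed from the usual install layout

    void RunTwoClones() {
      paddle::AnalysisConfig config;
      config.EnableUseGpu(100, 0);
      config.SetModel("./model", "./params");  // placeholder paths
      config.SwitchUseFeedFetchOps(false);
      config.EnableTensorRtEngine(1 << 30, 1, 1,
                                  paddle::AnalysisConfig::Precision::kFloat32,
                                  false, false);
      // Dynamic shape is the case that needs one profile per predictor.
      std::map<std::string, std::vector<int>> min_s{{"image", {1, 1, 3, 3}}};
      std::map<std::string, std::vector<int>> max_s{{"image", {1, 1, 10, 10}}};
      std::map<std::string, std::vector<int>> opt_s{{"image", {1, 1, 3, 3}}};
      config.SetTRTDynamicShapeInfo(min_s, max_s, opt_s);

      auto predictor = paddle::CreatePaddlePredictor(config);
      // Clone() triggers ResetTrtOps(2): the engine is rebuilt with 2 profiles.
      auto clone = predictor->Clone();
      std::thread worker([&] { /* bind clone's inputs, then */ clone->ZeroCopyRun(); });
      /* bind predictor's inputs, then */ predictor->ZeroCopyRun();
      worker.join();
    }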
From 63658c4ed6f3e5aff08b7c0089c8901af22ef02e Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 05:39:44 +0000
Subject: [PATCH 05/12] profile bug fix

---
 paddle/fluid/inference/tensorrt/engine.h          | 14 ++++++++++----
 .../fluid/operators/tensorrt/tensorrt_engine_op.h |  3 ++-
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 5b1f52372ce76..7aaeb739de194 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -253,6 +253,9 @@ class TensorRTEngine {
         infer_engine_,
         platform::errors::InvalidArgument(
             "You should build engine first and then set the context."));
+      // We may see trt warning: Profile 0 has been chosen by another
+      // IExecutionContext...
+      // It's ok. We will set it later.
       infer_context_[tid].reset(infer_engine_->createExecutionContext());
       if (with_dynamic_shape_) {
         // need new profile if it's not the first
@@ -267,10 +270,13 @@ class TensorRTEngine {
   }
 
   int GetProfileIndex() {
-    // re-think!!!
-    std::unique_lock<std::mutex> lock(mutex_);
-    const std::thread::id tid = std::this_thread::get_id();
-    return profile_index_[tid];
+    if (max_profile_num_ > 1) {
+      std::unique_lock<std::mutex> lock(mutex_);
+      const std::thread::id tid = std::this_thread::get_id();
+      return profile_index_[tid];
+    } else {
+      return 0;
+    }
   }
 
   int GetBindingsOffset() {
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index e4687b0cca9c9..1c1f63331d056 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -530,7 +530,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
     VLOG(4) << "TensorRT Engine Op Outputs:";
     for (const auto &y : Outputs("Ys")) {
       const int bind_index =
-          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
+          engine->engine()->getBindingIndex(output_maps[output_index].c_str()) +
+          binding_offset;
       std::vector<int> ddim;
 
       if (!engine->with_dynamic_shape()) {
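
Patch 05 completes the per-profile plumbing: output bindings get the same offset as inputs, GetProfileIndex() skips the lock and the map for single-profile engines, and the new comment acknowledges TensorRT's benign "Profile 0 has been chosen by another IExecutionContext" warning. In raw-API terms, the per-thread sequence now amounts to something like this sketch (the tensor name and dims are illustrative):

    // Select this context's profile, then address bindings through that
    // profile's group. Profile 0 is the default for a fresh context, so
    // selection is only needed for index > 0, the same guard the patch uses.
    void BindInput(nvinfer1::ICudaEngine *engine,
                   nvinfer1::IExecutionContext *context, int profile_index) {
      const int per_profile =
          engine->getNbBindings() / engine->getNbOptimizationProfiles();
      const int offset = per_profile * profile_index;
      if (profile_index > 0) context->setOptimizationProfile(profile_index);
      const int input0 = engine->getBindingIndex("image");  // profile-0 index
      context->setBindingDimensions(input0 + offset, nvinfer1::Dims4{1, 1, 5, 5});
    }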
From 0f00d05a9a7be0f5d73fd11ea8dc854bfa71a780 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 06:27:54 +0000
Subject: [PATCH 06/12] ci test

---
 paddle/fluid/inference/api/CMakeLists.txt      | 4 ++--
 paddle/fluid/inference/tensorrt/CMakeLists.txt | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 53b92c1336302..f322a36a1f69c 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -65,10 +65,10 @@ if(WITH_TESTING)
   endif()
   if (NOT APPLE AND NOT WIN32)
-    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_inference_shared 
+    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_inference_shared
             ARGS --dirname=${WORD2VEC_MODEL_DIR})
   elseif (WIN32)
-    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} 
+    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
             ARGS --dirname=${WORD2VEC_MODEL_DIR})
   endif()
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index d1d146b2ce5f6..b5ae120e33896 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,8 +1,8 @@
 # Compiling with WITH_PYTHON=ON and WITH_TENSORRT=ON failed on windows. Temporarily add paddle_inference_api dependency to solve the problem
 if(WIN32)
-nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost paddle_inference_api)
+nv_library(tensorrt_engine SRCS trt_int8_calibrator.cc engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost paddle_inference_api)
 else()
-nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
+nv_library(tensorrt_engine SRCS trt_int8_calibrator.cc engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
 endif()
 nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)

From eda2fd6aa673b68baa6ef85714671966831c58ea Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 06:39:32 +0000
Subject: [PATCH 07/12] argument fix

---
 .../analysis/passes/ir_params_sync_among_devices_pass.cc | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 8be4c2f521242..516e3d9c5c1f9 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -56,10 +56,15 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // Because there exists the case that new parameter variables are not added to
   // the program in the analysis pass.
   bool reserve_cpu_weights = false;
-  bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
-                             argument->min_input_shape().size() > 0 &&
-                             argument->optim_input_shape().size() > 0) ||
-                            argument->tensorrt_tuned_dynamic_shape();
+  bool with_dynamic_shape = false;
+  if (argument->Has("max_input_shape") && argument->Has("min_input_shape") &&
+      argument->Has("optim_input_shape")) {
+    with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
+                          argument->min_input_shape().size() > 0 &&
+                          argument->optim_input_shape().size() > 0)
+  }
+  with_dynamic_shape =
+      with_dynamic_shape || argument->tensorrt_tuned_dynamic_shape();
   if (with_dynamic_shape) {
     reserve_cpu_weights = true;
   }

From 61c39d8b1faf71a5a18fd37b2a10c2f63486f2e4 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 06:49:51 +0000
Subject: [PATCH 08/12] miss comma

---
 .../analysis/passes/ir_params_sync_among_devices_pass.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 516e3d9c5c1f9..7be781cb5e03a 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -61,7 +61,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
       argument->Has("optim_input_shape")) {
     with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
                           argument->min_input_shape().size() > 0 &&
-                          argument->optim_input_shape().size() > 0)
+                          argument->optim_input_shape().size() > 0);
   }
   with_dynamic_shape =
       with_dynamic_shape || argument->tensorrt_tuned_dynamic_shape();
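
Patches 07 and 08 are a follow-up to patch 03: Argument fields are optional, and their generated accessors enforce that the field has actually been set, so every read must be guarded by Has(). (Patch 08 repairs the semicolon patch 07 dropped; "miss comma" in the subject undersells it.) The shape of the guarded check, extracted into a hypothetical helper for clarity; this function is not part of the patch, and namespace qualifiers are omitted:

    // Hypothetical refactoring of the check added to
    // IrParamsSyncAmongDevicesPass::RunImpl.
    bool HasExplicitDynamicShape(const Argument &argument) {
      if (!argument.Has("max_input_shape") || !argument.Has("min_input_shape") ||
          !argument.Has("optim_input_shape")) {
        return false;  // reading an unset field would trip an enforce error
      }
      return argument.max_input_shape().size() > 0 &&
             argument.min_input_shape().size() > 0 &&
             argument.optim_input_shape().size() > 0;
    }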
From 57dba8bfbddbcac053cc8de13f5e5504bc687733 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 07:35:26 +0000
Subject: [PATCH 09/12] makefile

---
 paddle/fluid/framework/CMakeLists.txt          | 4 ++++
 paddle/fluid/inference/tensorrt/CMakeLists.txt | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index d8051e1fbb116..94461f0d941a4 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -274,7 +274,11 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
 
+if (TENSORRT_FOUND)
+cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper tensorrt_engine_op)
+else()
 cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+endif(TENSORRT_FOUND)
 
 cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector op_registry while_op_helper recurrent_op_helper conditional_block_op_helper)
 if(WITH_DISTRIBUTE)
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index b5ae120e33896..d1d146b2ce5f6 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,8 +1,8 @@
 # Compiling with WITH_PYTHON=ON and WITH_TENSORRT=ON failed on windows. Temporarily add paddle_inference_api dependency to solve the problem
 if(WIN32)
-nv_library(tensorrt_engine SRCS trt_int8_calibrator.cc engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost paddle_inference_api)
+nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost paddle_inference_api)
 else()
-nv_library(tensorrt_engine SRCS trt_int8_calibrator.cc engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
+nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
 endif()
 nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)

From 20ac825f5b7ccb58f2cd16d0ee330bb79d5b3edd Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 08:14:45 +0000
Subject: [PATCH 10/12] fix tuned

---
 .../analysis/passes/ir_params_sync_among_devices_pass.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 7be781cb5e03a..8bb08b6fdaf2a 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -64,7 +64,8 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
                           argument->optim_input_shape().size() > 0);
   }
   with_dynamic_shape =
-      with_dynamic_shape || argument->tensorrt_tuned_dynamic_shape();
+      with_dynamic_shape || (argument->Has("tensorrt_tuned_dynamic_shape") &&
+                             argument->tensorrt_tuned_dynamic_shape());
   if (with_dynamic_shape) {
     reserve_cpu_weights = true;
   }
From 33153521d77b81276eea84de0b94dcb89937b1cc Mon Sep 17 00:00:00 2001
From: wenbin
Date: Wed, 29 Dec 2021 04:05:09 +0000
Subject: [PATCH 11/12] add ut

---
 paddle/fluid/inference/api/CMakeLists.txt         |  4 ++--
 .../tests/api/trt_dynamic_shape_test.cc           | 82 +++++++++++++++++++
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index f322a36a1f69c..53b92c1336302 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -65,10 +65,10 @@ if(WITH_TESTING)
   endif()
   if (NOT APPLE AND NOT WIN32)
-    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_inference_shared
+    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_inference_shared 
            ARGS --dirname=${WORD2VEC_MODEL_DIR})
   elseif (WIN32)
-    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
+    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} 
            ARGS --dirname=${WORD2VEC_MODEL_DIR})
   endif()
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
index 4f6742b88b28c..11aee2a4a4749 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
@@ -207,6 +207,87 @@ void TestTunedDynamic() {
   check_func(test_predictor.get());
 }
 
+void TestDynamicClone(bool with_dynamic = true, bool delete_cache = true,
+                      bool delete_conv_bn = false) {
+  std::string model_dir =
+      FLAGS_infer_model + "/conv_bn_swish_split_gelu/conv_bn_swish_split_gelu";
+
+  std::string opt_cache_dir = model_dir + "/my_cache";
+  if (delete_cache) {
+    delete_cache_files(opt_cache_dir);
+  }
+
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  std::string buffer_prog, buffer_param;
+  ReadBinaryFile(model_dir + "/model", &buffer_prog);
+  ReadBinaryFile(model_dir + "/params", &buffer_param);
+  config.SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
+                        buffer_param.size());
+  config.SetOptimCacheDir(opt_cache_dir);
+
+  config.SwitchUseFeedFetchOps(false);
+  // Set the input's min, max, opt shape
+  config.EnableTensorRtEngine(1 << 30, 1, 1,
+                              AnalysisConfig::Precision::kFloat32, true, true);
+  if (delete_conv_bn) {
+    config.pass_builder()->DeletePass("conv_bn_fuse_pass");
+  }
+  if (with_dynamic) {
+    std::map<std::string, std::vector<int>> min_input_shape = {
+        {"image", {1, 1, 3, 3}}};
+    std::map<std::string, std::vector<int>> max_input_shape = {
+        {"image", {1, 1, 10, 10}}};
+    std::map<std::string, std::vector<int>> opt_input_shape = {
+        {"image", {1, 1, 3, 3}}};
+
+    config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
+                                  opt_input_shape);
+  }
+  auto predictor = CreatePaddlePredictor(config);
+  auto input_names = predictor->GetInputNames();
+  int channels = 1;
+  int height = 3;
+  int width = 3;
+  int input_num = channels * height * width * 1;
+
+  float *input = new float[input_num];
+  memset(input, 0, input_num * sizeof(float));
+  auto input_t = predictor->GetInputTensor(input_names[0]);
+  input_t->Reshape({1, channels, height, width});
+  input_t->copy_from_cpu(input);
+
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+
+  std::vector<float> out_data;
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+  out_data.resize(out_num);
+  output_t->copy_to_cpu(out_data.data());
+
+  auto predictor2 = predictor->Clone();
+  auto input_t2 = predictor2->GetInputTensor(input_names[0]);
+  input_t2->Reshape({1, channels, height, width});
+  input_t2->copy_from_cpu(input);
+
+  ASSERT_TRUE(predictor2->ZeroCopyRun());
+
+  std::vector<float> out_data2;
+  auto output_t2 = predictor2->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape2 = output_t2->shape();
+  int out_num2 = std::accumulate(output_shape2.begin(), output_shape2.end(), 1,
+                                 std::multiplies<int>());
+  out_data2.resize(out_num2);
+  output_t2->copy_to_cpu(out_data2.data());
+  ASSERT_TRUE(out_data2.size() == out_data.size());
+  for (size_t i = 0; i < out_data.size(); i++) {
+    EXPECT_NEAR(out_data2[i], out_data[i], 1e-5);
+  }
+}
+
 TEST(AnalysisPredictor, trt_dynamic) { TestDynamic(true); }
 TEST(AnalysisPredictor, trt_static) { TestDynamic(false); }
 TEST(AnalysisPredictor, trt_memory_serialize) {
@@ -218,6 +299,7 @@ TEST(AnalysisPredictor, trt_memory_serialize) {
 TEST(AnalysisPredictor, trt_dynamic2) { TestDynamic2(); }
 
 TEST(AnalysisPredictor, trt_tuned_dynamic) { TestTunedDynamic(); }
+TEST(AnalysisPredictor, trt_dynamic_clone) { TestDynamicClone(); }
 
 }  // namespace inference
 }  // namespace paddle

From 3dfed217a7034ea458ebfc4f8b585f0178226581 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Wed, 29 Dec 2021 05:06:13 +0000
Subject: [PATCH 12/12] false

---
 paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
index 11aee2a4a4749..ccdf237ffa54d 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
@@ -228,8 +228,8 @@ void TestDynamicClone(bool with_dynamic = true, bool delete_cache = true,
 
   config.SwitchUseFeedFetchOps(false);
   // Set the input's min, max, opt shape
-  config.EnableTensorRtEngine(1 << 30, 1, 1,
-                              AnalysisConfig::Precision::kFloat32, true, true);
+  config.EnableTensorRtEngine(
+      1 << 30, 1, 1, AnalysisConfig::Precision::kFloat32, false, false);
   if (delete_conv_bn) {
     config.pass_builder()->DeletePass("conv_bn_fuse_pass");
  }
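
A closing note on patch 12: the one-line change stops the new clone test from exercising engine serialization and INT8 calibration, keeping it hermetic. Annotated against my reading of the AnalysisConfig::EnableTensorRtEngine signature; the comments are editorial, not part of the patch:

    config.EnableTensorRtEngine(
        1 << 30,                              // workspace_size
        1,                                    // max_batch_size
        1,                                    // min_subgraph_size
        AnalysisConfig::Precision::kFloat32,  // precision
        false,                                // use_static: no engine serialization
        false);                               // use_calib_mode: no INT8 calibration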