From 51462f7dcadcfe94ceaf6fcead2943a84bb9b19a Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 21 Dec 2021 04:57:44 +0000
Subject: [PATCH 01/12] compile done

---
 paddle/fluid/framework/naive_executor.cc      | 35 +++++++++++
 paddle/fluid/framework/naive_executor.h       |  2 +
 .../fluid/inference/api/analysis_predictor.h  |  1 +
 paddle/fluid/inference/tensorrt/engine.cc     | 60 ++++++++++---------
 paddle/fluid/inference/tensorrt/engine.h      | 30 +++++++++-
 .../operators/tensorrt/tensorrt_engine_op.h   | 56 ++++++++++-------
 6 files changed, 134 insertions(+), 50 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 7d55d8c41e3e9..e7dc4eab74a2f 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -20,6 +20,9 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
+#if PADDLE_WITH_TENSORRT
+#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -132,5 +135,37 @@ NaiveExecutor::~NaiveExecutor() {
 #endif
 }
 
+void NaiveExecutor::ResetTrtOps(int num) {
+#if PADDLE_WITH_TENSORRT
+  for (auto &op : ops_) {
+    if (op->Type() == "tensorrt_engine") {
+      operators::TensorRTEngineOp *trtop =
+          dynamic_cast<operators::TensorRTEngineOp *>(op.get());
+      if (!trtop) return;
+      std::string engine_key = trtop->Attr<std::string>("engine_key");
+      int engine_predictor_id = trtop->Attr<int>("predictor_id");
+      std::string engine_name =
+          engine_key + std::to_string(engine_predictor_id);
+      operators::TensorRTEngine *trt_engine =
+          paddle::inference::Singleton<
+              inference::tensorrt::TRTEngineManager>::Global()
+              .Get(engine_name);
+      if (trt_engine->with_dynamic_shape()) {
+        trt_engine->ResetContext();
+        trt_engine->ClearTensorMap();
+        trt_engine->SetProfileNum(num);
+        auto *anc = scope_->parent();
+        while (anc && anc->parent()) {
+          anc = anc->parent();
+        }
+        if (anc == nullptr) {
+          anc = scope_;
+        }
+        trtop->PrepareTRTEngine(*anc, trt_engine);
+      }
+    }
+  }
+#endif
+}
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index f38632a9a639c..ed475e66f626d 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -63,6 +63,8 @@ class NaiveExecutor {
 
   void CleanFeedFetchOps();
 
+  void ResetTrtOps(int num);
+
  protected:
   void CreateOps(const ProgramDesc& desc, int block_id,
                  bool with_feed_fetch_ops);
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 9c36051757527..4c7b8e081300d 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -429,6 +429,7 @@ class AnalysisPredictor : public PaddlePredictor {
   bool status_is_cloned_{false};
   std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
+  int clone_num{1};
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index cb815e00c4430..669eb165614a7 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -42,7 +42,10 @@ void TensorRTEngine::InitNetwork() {
   }
 
   infer_builder_config_.reset(infer_builder_->createBuilderConfig());
-  optim_profile_ = infer_builder_->createOptimizationProfile();
+  // optim_profile_ = infer_builder_->createOptimizationProfile();
+  optim_profiles_.resize(max_profile_num_);
+  for (int i = 0; i < max_profile_num_; i++)
+    optim_profiles_[i] = infer_builder_->createOptimizationProfile();
 }
 
 void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
@@ -199,35 +202,37 @@ void TensorRTEngine::FreezeNetwork() {
   if (with_dynamic_shape_) {
 #if IS_TRT_VERSION_GE(6000)
     LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode.";
-    for (auto &input : min_input_shape_) {
+    for (int i = 0; i < max_profile_num_; i++) {
+      for (auto &input : min_input_shape_) {
 #if IS_TRT_VERSION_LT(7000)
-      // trt6 will check all_of input > 0
-      if (!(std::all_of(input.second.begin(), input.second.end(),
-                        [](int x) { return x > 0; }) &&
-            std::all_of(max_input_shape_[input.first].begin(),
-                        max_input_shape_[input.first].end(),
-                        [](int x) { return x > 0; }) &&
-            std::all_of(optim_input_shape_[input.first].begin(),
-                        optim_input_shape_[input.first].end(),
-                        [](int x) { return x > 0; }))) {
-        continue;
-      }
+        // trt6 will check all_of input > 0
+        if (!(std::all_of(input.second.begin(), input.second.end(),
+                          [](int x) { return x > 0; }) &&
+              std::all_of(max_input_shape_[input.first].begin(),
+                          max_input_shape_[input.first].end(),
+                          [](int x) { return x > 0; }) &&
+              std::all_of(optim_input_shape_[input.first].begin(),
+                          optim_input_shape_[input.first].end(),
+                          [](int x) { return x > 0; }))) {
+          continue;
+        }
 #endif
-      VLOG(4) << "TRT dynamic_shape set " << input.first
-              << " min: " << Vec2Str(input.second)
-              << ", max: " << Vec2Str(max_input_shape_[input.first])
-              << ", opt: " << Vec2Str(optim_input_shape_[input.first]);
-      optim_profile_->setDimensions(
-          input.first.c_str(), nvinfer1::OptProfileSelector::kMIN,
-          Vec2TRT_Dims(input.second, input.first, true));
-      optim_profile_->setDimensions(
-          input.first.c_str(), nvinfer1::OptProfileSelector::kMAX,
-          Vec2TRT_Dims(max_input_shape_[input.first], input.first, true));
-      optim_profile_->setDimensions(
-          input.first.c_str(), nvinfer1::OptProfileSelector::kOPT,
-          Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
+        VLOG(4) << "TRT dynamic_shape set " << input.first
+                << " min: " << Vec2Str(input.second)
+                << ", max: " << Vec2Str(max_input_shape_[input.first])
+                << ", opt: " << Vec2Str(optim_input_shape_[input.first]);
+        optim_profiles_[i]->setDimensions(
+            input.first.c_str(), nvinfer1::OptProfileSelector::kMIN,
+            Vec2TRT_Dims(input.second, input.first, true));
+        optim_profiles_[i]->setDimensions(
+            input.first.c_str(), nvinfer1::OptProfileSelector::kMAX,
+            Vec2TRT_Dims(max_input_shape_[input.first], input.first, true));
+        optim_profiles_[i]->setDimensions(
+            input.first.c_str(), nvinfer1::OptProfileSelector::kOPT,
+            Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
+      }
+      infer_builder_config_->addOptimizationProfile(optim_profiles_[i]);
     }
-    infer_builder_config_->addOptimizationProfile(optim_profile_);
     if (WithFp16() && disable_trt_plugin_fp16()) {
       LOG(INFO) << "NOTE: In order to achieve higher accuracy, you have "
                    "disabled the fp16 mode of TRT Plugin,\n"
@@ -236,6 +241,7 @@
                    "opt_shape, false /*disable_trt_plugin_fp16*/)'";
     }
 #endif
+    binding_num_ = infer_engine_->getNbBindings();
   }
 
 #if IS_TRT_VERSION_GE(8200)
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index e6f58c8c8e8f4..5203d5699150d 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -254,9 +254,31 @@ class TensorRTEngine {
         platform::errors::InvalidArgument(
             "You should build engine first and then set the context."));
       infer_context_[tid].reset(infer_engine_->createExecutionContext());
+      if (with_dynamic_shape_) {
+        // need new profile if it's not the first
+        if (profile_num_ > 0) {
+          infer_context_[tid]->setOptimizationProfile(profile_num_);
+        }
+        profile_index_[tid] = profile_num_;
+        ++profile_num_;
+      }
     }
     return infer_context_[tid].get();
   }
+
+  int GetProfileIndex() {
+    // re-think!!!
+    std::unique_lock<std::mutex> lock(mutex_);
+    const std::thread::id tid = std::this_thread::get_id();
+    return profile_index_[tid];
+  }
+
+  int GetBindingsOffset() {
+    return (binding_num_ / max_profile_num_) * GetProfileIndex();
+  }
+
+  int GetNbBindings() { return binding_num_; }
+
   void ResetContext() {
     std::unique_lock<std::mutex> lock(mutex_);
     const std::thread::id tid = std::this_thread::get_id();
@@ -322,6 +344,7 @@
                    "generating serialization file and doing inference are "
                    "consistent."));
 
+    binding_num_ = infer_engine_->getNbBindings();
     GetEngineInfo();
   }
@@ -540,6 +563,7 @@
     }
   }
 
+  void SetProfileNum(int num) { max_profile_num_ = num; }
   void GetEngineInfo() {
 #if IS_TRT_VERSION_GE(8200)
     std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
@@ -571,6 +595,9 @@
   int batch_size_{-1};
 
   int device_id_;
+  int max_profile_num_{1};
+  int profile_num_{0};
+  std::unordered_map<std::thread::id, int> profile_index_;
   ShapeMapType min_input_shape_;
   ShapeMapType max_input_shape_;
   ShapeMapType optim_input_shape_;
@@ -614,8 +641,9 @@
   // For dynamic shape
   bool with_dynamic_shape_{false};
 #if IS_TRT_VERSION_GE(6000)
+  int binding_num_;
   infer_ptr<nvinfer1::IBuilderConfig> infer_builder_config_;
-  nvinfer1::IOptimizationProfile* optim_profile_;
+  std::vector<nvinfer1::IOptimizationProfile*> optim_profiles_;
   std::vector<std::unique_ptr<plugin::DynamicPluginTensorRT>> owned_pluginv2_;
 #endif
   std::mutex mutex_;
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 35612905f8569..e4687b0cca9c9 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -250,6 +250,23 @@ class TensorRTEngineOp : public framework::OperatorBase {
     }
   }
 
+  void PrepareTRTEngine(const framework::Scope &scope,
+                        TensorRTEngine *engine) const {
+    LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
+                 "kernel etc). This process may cost a lot of time.";
+    framework::proto::BlockDesc block_proto;
+    block_proto.ParseFromString(Attr<std::string>("subgraph"));
+    framework::BlockDesc block_desc(nullptr, &block_proto);
+
+    std::vector<std::string> inputs = Inputs("Xs");
+    std::vector<std::string> outputs =
+        Attr<std::vector<std::string>>("output_name_mapping");
+
+    inference::Singleton<inference::tensorrt::OpConverter>::Global()
+        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
+                                 outputs, engine);
+  }
+
  protected:
   void RunNativeImpl(const framework::Scope &scope,
                      const platform::Place &dev_place) const {
@@ -414,8 +431,19 @@
     int num_inputs = 0;
 
     num_inputs += runtime_input_names_.size();
-    const int num_bindings = num_inputs + Outputs("Ys").size();
-    std::vector<void *> buffers(num_bindings);
+    // const int num_bindings = num_inputs + Outputs("Ys").size();
+    // std::vector<void *> buffers(num_bindings);
+    // This method returns the total over all profiles.
+    const int num_bindings = engine->GetNbBindings();
+    std::vector<void *> buffers(num_bindings, nullptr);
+
+    int binding_offset = 0;
+    nvinfer1::IExecutionContext *trt_context = nullptr;
+    if (engine->with_dynamic_shape()) {
+      // Initialize context and get offset by profile index
+      trt_context = engine->context();
+      binding_offset = engine->GetBindingsOffset();
+    }
 
     // Bind input tensor to TRT.
     for (const auto &x : runtime_input_names_) {
@@ -430,7 +458,10 @@
         t.ShareDataWith(out);
       }
       auto t_shape = framework::vectorize(t.dims());
-      const int bind_index = engine->engine()->getBindingIndex(x.c_str());
+      // const int bind_index = engine->engine()->getBindingIndex(x.c_str());
+      // Get the profile-0 index first, then add the binding offset
+      const int bind_index =
+          engine->engine()->getBindingIndex(x.c_str()) + binding_offset;
       PADDLE_ENFORCE_LT(
           bind_index, num_bindings,
           platform::errors::InvalidArgument(
@@ -474,7 +505,6 @@
         }
       } else {
 #if IS_TRT_VERSION_GE(6000)
-        auto *trt_context = engine->context();
         trt_context->setBindingDimensions(
             bind_index, inference::tensorrt::Vec2TRT_Dims(t_shape, x, true));
 #endif
@@ -511,7 +541,6 @@
       }
     } else {
 #if IS_TRT_VERSION_GE(6000)
-      auto *trt_context = engine->context();
       auto dims = trt_context->getBindingDimensions(bind_index);
       int nb_dims = dims.nbDims;
       for (; nb_dims > 0; nb_dims--) {
@@ -583,23 +612,6 @@
     }
     return trt_engine_;
   }
-
-  void PrepareTRTEngine(const framework::Scope &scope,
-                        TensorRTEngine *engine) const {
-    LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
-                 "kernel etc). This process may cost a lot of time.";
-    framework::proto::BlockDesc block_proto;
-    block_proto.ParseFromString(Attr<std::string>("subgraph"));
-    framework::BlockDesc block_desc(nullptr, &block_proto);
-
-    std::vector<std::string> inputs = Inputs("Xs");
-    std::vector<std::string> outputs =
-        Attr<std::vector<std::string>>("output_name_mapping");
-
-    inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
-                                 outputs, engine);
-  }
 };
 
 }  // namespace operators
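
Patch 01 is the heart of the series: InitNetwork() now builds one TensorRT optimization profile per expected predictor, FreezeNetwork() registers every profile's min/opt/max shape ranges with the builder config, and context() pins each thread's execution context to its own profile. For readers less familiar with the multi-profile API, here is a stand-alone sketch of what FreezeNetwork() is now doing, written against the raw TensorRT API; the input name "image" and the dims are illustrative only, borrowed from the unit test added in patch 11:

    #include "NvInfer.h"

    // One IOptimizationProfile per predictor clone, all registered with the
    // same builder config before the engine is built.
    void AddProfiles(nvinfer1::IBuilder *builder,
                     nvinfer1::IBuilderConfig *config, int num_profiles) {
      for (int i = 0; i < num_profiles; ++i) {
        nvinfer1::IOptimizationProfile *profile =
            builder->createOptimizationProfile();
        // This sketch gives every profile the same shape range; distinct
        // ranges per profile would be equally legal.
        profile->setDimensions("image", nvinfer1::OptProfileSelector::kMIN,
                               nvinfer1::Dims4{1, 1, 3, 3});
        profile->setDimensions("image", nvinfer1::OptProfileSelector::kOPT,
                               nvinfer1::Dims4{1, 1, 3, 3});
        profile->setDimensions("image", nvinfer1::OptProfileSelector::kMAX,
                               nvinfer1::Dims4{1, 1, 10, 10});
        config->addOptimizationProfile(profile);
      }
    }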
From e8914794ed490527ef1c60642ba5274febf0d684 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Wed, 22 Dec 2021 12:00:55 +0000
Subject: [PATCH 02/12] trt

---
 paddle/fluid/framework/naive_executor.cc         | 1 +
 paddle/fluid/inference/api/analysis_predictor.cc | 1 +
 paddle/fluid/inference/api/analysis_predictor.h  | 2 +-
 paddle/fluid/inference/tensorrt/engine.cc        | 5 +++--
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index e7dc4eab74a2f..3d15fe4382719 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -151,6 +151,7 @@ void NaiveExecutor::ResetTrtOps(int num) {
               inference::tensorrt::TRTEngineManager>::Global()
               .Get(engine_name);
       if (trt_engine->with_dynamic_shape()) {
+        LOG(INFO) << "rebuild trt engine!";
         trt_engine->ResetContext();
         trt_engine->ClearTensorMap();
         trt_engine->SetProfileNum(num);
         auto *anc = scope_->parent();
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 5d5719533e7a7..acf1d54afe756 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1332,6 +1332,7 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
   std::lock_guard<std::mutex> lk(clone_mutex_);
   auto *x = new AnalysisPredictor(config_);
   x->Init(scope_, inference_program_);
+  x->executor_->ResetTrtOps(++x->clone_num_);
   return std::unique_ptr<PaddlePredictor>(x);
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 4c7b8e081300d..8edd459202897 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -429,7 +429,7 @@ class AnalysisPredictor : public PaddlePredictor {
   bool status_is_cloned_{false};
   std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
-  int clone_num{1};
+  int clone_num_{1};
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 669eb165614a7..0e5032f0f935c 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -221,6 +221,7 @@ void TensorRTEngine::FreezeNetwork() {
                 << " min: " << Vec2Str(input.second)
                 << ", max: " << Vec2Str(max_input_shape_[input.first])
                 << ", opt: " << Vec2Str(optim_input_shape_[input.first]);
+
         optim_profiles_[i]->setDimensions(
             input.first.c_str(), nvinfer1::OptProfileSelector::kMIN,
             Vec2TRT_Dims(input.second, input.first, true));
@@ -241,9 +242,7 @@
         "opt_shape, false /*disable_trt_plugin_fp16*/)'";
     }
 #endif
-    binding_num_ = infer_engine_->getNbBindings();
   }
-
 #if IS_TRT_VERSION_GE(8200)
   infer_builder_config_->setProfilingVerbosity(
       nvinfer1::ProfilingVerbosity::kDETAILED);
@@ -266,6 +265,8 @@
           "Build TensorRT cuda engine failed! Please recheck "
           "you configurations related to paddle-TensorRT."));
 
+  binding_num_ = infer_engine_->getNbBindings();
+
   GetEngineInfo();
 }
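
Patch 02 hooks the rebuild into AnalysisPredictor::Clone() via ResetTrtOps(++clone_num_) and records binding_num_ once the engine has actually been built. The bookkeeping relies on TensorRT's binding layout: an engine with K profiles exposes K equal-sized groups of bindings, and getBindingIndex() always answers with the profile-0 index. The arithmetic behind GetBindingsOffset(), as a self-contained sketch with illustrative numbers:

    // An engine with 6 total bindings and 3 profiles owns 6 / 3 = 2 bindings
    // per profile, so profile 2's group starts at binding index 4.
    int BindingOffset(int total_bindings, int num_profiles, int profile_index) {
      const int bindings_per_profile = total_bindings / num_profiles;
      return bindings_per_profile * profile_index;
    }
    // A tensor whose profile-0 binding index is i is addressed as
    // i + BindingOffset(total, profiles, k) when profile k is selected.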
From bb0283498ce7c14617a7963b09e6f99c8131c9a1 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Mon, 27 Dec 2021 13:11:46 +0000
Subject: [PATCH 03/12] tensor fix

---
 paddle/fluid/framework/naive_executor.cc                 | 2 +-
 .../analysis/passes/ir_params_sync_among_devices_pass.cc | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 3d15fe4382719..9bd6aba3ea842 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -151,7 +151,7 @@ void NaiveExecutor::ResetTrtOps(int num) {
               inference::tensorrt::TRTEngineManager>::Global()
               .Get(engine_name);
       if (trt_engine->with_dynamic_shape()) {
-        LOG(INFO) << "rebuild trt engine!";
+        LOG(INFO) << "rebuild trt engine, this may cost a lot of time!";
         trt_engine->ResetContext();
         trt_engine->ClearTensorMap();
         trt_engine->SetProfileNum(num);
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 9993bb37d5140..8be4c2f521242 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -56,8 +56,11 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // Because there exists the case that new parameter variables are not added to
   // the program in the analysis pass.
   bool reserve_cpu_weights = false;
-  if (argument->tensorrt_allow_build_at_runtime_valid() &&
-      argument->tensorrt_allow_build_at_runtime()) {
+  bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
+                             argument->min_input_shape().size() > 0 &&
+                             argument->optim_input_shape().size() > 0) ||
+                            argument->tensorrt_tuned_dynamic_shape();
+  if (with_dynamic_shape) {
     reserve_cpu_weights = true;
   }
   for (auto &var_name : all_vars) {

From 6d798208ba67d1c5a903193c5d0b307825def2da Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 04:20:53 +0000
Subject: [PATCH 04/12] rebuild trt

---
 paddle/fluid/inference/tensorrt/engine.cc |  5 +++++
 paddle/fluid/inference/tensorrt/engine.h  | 10 +++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 0e5032f0f935c..2a35f497ed07f 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -266,6 +266,11 @@ void TensorRTEngine::FreezeNetwork() {
           "you configurations related to paddle-TensorRT."));
 
   binding_num_ = infer_engine_->getNbBindings();
+  // reset status for dynamic shape clone
+  if (max_profile_num_ > 1) {
+    infer_context_.clear();
+    cur_profile_num_ = 0;
+  }
 
   GetEngineInfo();
 }
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 5203d5699150d..5b1f52372ce76 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -256,11 +256,11 @@ class TensorRTEngine {
       infer_context_[tid].reset(infer_engine_->createExecutionContext());
       if (with_dynamic_shape_) {
         // need new profile if it's not the first
-        if (profile_num_ > 0) {
-          infer_context_[tid]->setOptimizationProfile(profile_num_);
+        if (cur_profile_num_ > 0) {
+          infer_context_[tid]->setOptimizationProfile(cur_profile_num_);
         }
-        profile_index_[tid] = profile_num_;
-        ++profile_num_;
+        profile_index_[tid] = cur_profile_num_;
+        ++cur_profile_num_;
       }
     }
     return infer_context_[tid].get();
@@ -596,7 +596,7 @@ class TensorRTEngine {
   int device_id_;
 
   int max_profile_num_{1};
-  int profile_num_{0};
+  int cur_profile_num_{0};
   std::unordered_map<std::thread::id, int> profile_index_;
   ShapeMapType min_input_shape_;
   ShapeMapType max_input_shape_;
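
Patches 03 and 04 make the rebuild safe to repeat: parameters stay on the CPU whenever dynamic shape is active, so the subgraph can be re-converted after Clone(), and FreezeNetwork() now clears the cached per-thread contexts of a rebuilt multi-profile engine. End to end, the workflow this enables looks roughly like the following sketch; the model path, input name, and shapes are placeholders, and the pattern mirrors the unit test added in patch 11:

    #include <map>
    #include <string>
    #include <thread>
    #include <vector>
    #include "paddle_inference_api.h"  // header name assumed from the usual install layout

    void RunTwoClones() {
      paddle::AnalysisConfig config;
      config.EnableUseGpu(100, 0);
      config.SetModel("./model", "./params");  // placeholder paths
      config.SwitchUseFeedFetchOps(false);
      config.EnableTensorRtEngine(1 << 30, 1, 1,
                                  paddle::AnalysisConfig::Precision::kFloat32,
                                  false, false);
      // Dynamic shape is the case that needs one profile per predictor.
      std::map<std::string, std::vector<int>> min_s{{"image", {1, 1, 3, 3}}};
      std::map<std::string, std::vector<int>> max_s{{"image", {1, 1, 10, 10}}};
      std::map<std::string, std::vector<int>> opt_s{{"image", {1, 1, 3, 3}}};
      config.SetTRTDynamicShapeInfo(min_s, max_s, opt_s);

      auto predictor = paddle::CreatePaddlePredictor(config);
      // Clone() triggers ResetTrtOps(2): the engine is rebuilt with 2 profiles.
      auto clone = predictor->Clone();
      std::thread worker([&] { /* bind clone's inputs, then */ clone->ZeroCopyRun(); });
      /* bind predictor's inputs, then */ predictor->ZeroCopyRun();
      worker.join();
    }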
From 63658c4ed6f3e5aff08b7c0089c8901af22ef02e Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 05:39:44 +0000
Subject: [PATCH 05/12] profile bug fix

---
 paddle/fluid/inference/tensorrt/engine.h          | 14 ++++++++++----
 .../fluid/operators/tensorrt/tensorrt_engine_op.h |  3 ++-
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 5b1f52372ce76..7aaeb739de194 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -253,6 +253,9 @@ class TensorRTEngine {
         infer_engine_,
         platform::errors::InvalidArgument(
             "You should build engine first and then set the context."));
+      // We may see trt warning: Profile 0 has been chosen by another
+      // IExecutionContext...
+      // It's ok. We will set it later.
       infer_context_[tid].reset(infer_engine_->createExecutionContext());
       if (with_dynamic_shape_) {
         // need new profile if it's not the first
@@ -267,10 +270,13 @@ class TensorRTEngine {
   }
 
   int GetProfileIndex() {
-    // re-think!!!
-    std::unique_lock<std::mutex> lock(mutex_);
-    const std::thread::id tid = std::this_thread::get_id();
-    return profile_index_[tid];
+    if (max_profile_num_ > 1) {
+      std::unique_lock<std::mutex> lock(mutex_);
+      const std::thread::id tid = std::this_thread::get_id();
+      return profile_index_[tid];
+    } else {
+      return 0;
+    }
   }
 
   int GetBindingsOffset() {
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index e4687b0cca9c9..1c1f63331d056 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -530,7 +530,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
     VLOG(4) << "TensorRT Engine Op Outputs:";
     for (const auto &y : Outputs("Ys")) {
       const int bind_index =
-          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
+          engine->engine()->getBindingIndex(output_maps[output_index].c_str()) +
+          binding_offset;
       std::vector<int> ddim;
 
       if (!engine->with_dynamic_shape()) {
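
Patch 05 completes the per-profile plumbing: output bindings get the same offset as inputs, GetProfileIndex() skips the lock and the map for single-profile engines, and the new comment acknowledges TensorRT's benign "Profile 0 has been chosen by another IExecutionContext" warning. In raw-API terms, the per-thread sequence now amounts to something like this sketch (the tensor name and dims are illustrative):

    // Select this context's profile, then address bindings through that
    // profile's group. Profile 0 is the default for a fresh context, so
    // selection is only needed for index > 0, the same guard the patch uses.
    void BindInput(nvinfer1::ICudaEngine *engine,
                   nvinfer1::IExecutionContext *context, int profile_index) {
      const int per_profile =
          engine->getNbBindings() / engine->getNbOptimizationProfiles();
      const int offset = per_profile * profile_index;
      if (profile_index > 0) context->setOptimizationProfile(profile_index);
      const int input0 = engine->getBindingIndex("image");  // profile-0 index
      context->setBindingDimensions(input0 + offset, nvinfer1::Dims4{1, 1, 5, 5});
    }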
From 0f00d05a9a7be0f5d73fd11ea8dc854bfa71a780 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 06:27:54 +0000
Subject: [PATCH 06/12] ci test

---
 paddle/fluid/inference/api/CMakeLists.txt      | 4 ++--
 paddle/fluid/inference/tensorrt/CMakeLists.txt | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 53b92c1336302..f322a36a1f69c 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -65,10 +65,10 @@ if(WITH_TESTING)
   endif()
   if (NOT APPLE AND NOT WIN32)
-    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_inference_shared 
+    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_inference_shared
             ARGS --dirname=${WORD2VEC_MODEL_DIR})
   elseif (WIN32)
-    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} 
+    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
             ARGS --dirname=${WORD2VEC_MODEL_DIR})
   endif()
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index d1d146b2ce5f6..b5ae120e33896 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,8 +1,8 @@
 # Compiling with WITH_PYTHON=ON and WITH_TENSORRT=ON failed on windows. Temporarily add paddle_inference_api dependency to solve the problem
 if(WIN32)
-nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost paddle_inference_api)
+nv_library(tensorrt_engine SRCS trt_int8_calibrator.cc engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost paddle_inference_api)
 else()
-nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
+nv_library(tensorrt_engine SRCS trt_int8_calibrator.cc engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
 endif()
 nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)

From eda2fd6aa673b68baa6ef85714671966831c58ea Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 06:39:32 +0000
Subject: [PATCH 07/12] argument fix

---
 .../analysis/passes/ir_params_sync_among_devices_pass.cc | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 8be4c2f521242..516e3d9c5c1f9 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -56,10 +56,15 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // Because there exists the case that new parameter variables are not added to
   // the program in the analysis pass.
   bool reserve_cpu_weights = false;
-  bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
-                             argument->min_input_shape().size() > 0 &&
-                             argument->optim_input_shape().size() > 0) ||
-                            argument->tensorrt_tuned_dynamic_shape();
+  bool with_dynamic_shape = false;
+  if (argument->Has("max_input_shape") && argument->Has("min_input_shape") &&
+      argument->Has("optim_input_shape")) {
+    with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
+                          argument->min_input_shape().size() > 0 &&
+                          argument->optim_input_shape().size() > 0)
+  }
+  with_dynamic_shape =
+      with_dynamic_shape || argument->tensorrt_tuned_dynamic_shape();
   if (with_dynamic_shape) {
     reserve_cpu_weights = true;
   }

From 61c39d8b1faf71a5a18fd37b2a10c2f63486f2e4 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 06:49:51 +0000
Subject: [PATCH 08/12] miss comma

---
 .../analysis/passes/ir_params_sync_among_devices_pass.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 516e3d9c5c1f9..7be781cb5e03a 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -61,7 +61,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
       argument->Has("optim_input_shape")) {
     with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
                           argument->min_input_shape().size() > 0 &&
-                          argument->optim_input_shape().size() > 0)
+                          argument->optim_input_shape().size() > 0);
   }
   with_dynamic_shape =
       with_dynamic_shape || argument->tensorrt_tuned_dynamic_shape();
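
Patches 07 and 08 are a follow-up to patch 03: Argument fields are optional, and their generated accessors enforce that the field has actually been set, so every read must be guarded by Has(). (Patch 08 repairs the semicolon patch 07 dropped; "miss comma" in the subject undersells it.) The shape of the guarded check, extracted into a hypothetical helper for clarity; this function is not part of the patch, and namespace qualifiers are omitted:

    // Hypothetical refactoring of the check added to
    // IrParamsSyncAmongDevicesPass::RunImpl.
    bool HasExplicitDynamicShape(const Argument &argument) {
      if (!argument.Has("max_input_shape") || !argument.Has("min_input_shape") ||
          !argument.Has("optim_input_shape")) {
        return false;  // reading an unset field would trip an enforce error
      }
      return argument.max_input_shape().size() > 0 &&
             argument.min_input_shape().size() > 0 &&
             argument.optim_input_shape().size() > 0;
    }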
From 57dba8bfbddbcac053cc8de13f5e5504bc687733 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 07:35:26 +0000
Subject: [PATCH 09/12] makefile

---
 paddle/fluid/framework/CMakeLists.txt          | 4 ++++
 paddle/fluid/inference/tensorrt/CMakeLists.txt | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index d8051e1fbb116..94461f0d941a4 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -274,7 +274,11 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
 
+if (TENSORRT_FOUND)
+cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper tensorrt_engine_op)
+else()
 cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+endif(TENSORRT_FOUND)
 
 cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector op_registry while_op_helper recurrent_op_helper conditional_block_op_helper)
 if(WITH_DISTRIBUTE)
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index b5ae120e33896..d1d146b2ce5f6 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,8 +1,8 @@
 # Compiling with WITH_PYTHON=ON and WITH_TENSORRT=ON failed on windows. Temporarily add paddle_inference_api dependency to solve the problem
 if(WIN32)
-nv_library(tensorrt_engine SRCS trt_int8_calibrator.cc engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost paddle_inference_api)
+nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost paddle_inference_api)
 else()
-nv_library(tensorrt_engine SRCS trt_int8_calibrator.cc engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
+nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
 endif()
 nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)

From 20ac825f5b7ccb58f2cd16d0ee330bb79d5b3edd Mon Sep 17 00:00:00 2001
From: wenbin
Date: Tue, 28 Dec 2021 08:14:45 +0000
Subject: [PATCH 10/12] fix tuned

---
 .../analysis/passes/ir_params_sync_among_devices_pass.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 7be781cb5e03a..8bb08b6fdaf2a 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -64,7 +64,8 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
                           argument->optim_input_shape().size() > 0);
   }
   with_dynamic_shape =
-      with_dynamic_shape || argument->tensorrt_tuned_dynamic_shape();
+      with_dynamic_shape || (argument->Has("tensorrt_tuned_dynamic_shape") &&
+                             argument->tensorrt_tuned_dynamic_shape());
   if (with_dynamic_shape) {
     reserve_cpu_weights = true;
   }
From 33153521d77b81276eea84de0b94dcb89937b1cc Mon Sep 17 00:00:00 2001
From: wenbin
Date: Wed, 29 Dec 2021 04:05:09 +0000
Subject: [PATCH 11/12] add ut

---
 paddle/fluid/inference/api/CMakeLists.txt         |  4 ++--
 .../tests/api/trt_dynamic_shape_test.cc           | 82 +++++++++++++++++++
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index f322a36a1f69c..53b92c1336302 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -65,10 +65,10 @@ if(WITH_TESTING)
   endif()
   if (NOT APPLE AND NOT WIN32)
-    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_inference_shared
+    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS paddle_inference_shared 
            ARGS --dirname=${WORD2VEC_MODEL_DIR})
   elseif (WIN32)
-    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
+    cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} 
            ARGS --dirname=${WORD2VEC_MODEL_DIR})
   endif()
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
index 4f6742b88b28c..11aee2a4a4749 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
@@ -207,6 +207,87 @@ void TestTunedDynamic() {
   check_func(test_predictor.get());
 }
 
+void TestDynamicClone(bool with_dynamic = true, bool delete_cache = true,
+                      bool delete_conv_bn = false) {
+  std::string model_dir =
+      FLAGS_infer_model + "/conv_bn_swish_split_gelu/conv_bn_swish_split_gelu";
+
+  std::string opt_cache_dir = model_dir + "/my_cache";
+  if (delete_cache) {
+    delete_cache_files(opt_cache_dir);
+  }
+
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  std::string buffer_prog, buffer_param;
+  ReadBinaryFile(model_dir + "/model", &buffer_prog);
+  ReadBinaryFile(model_dir + "/params", &buffer_param);
+  config.SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
+                        buffer_param.size());
+  config.SetOptimCacheDir(opt_cache_dir);
+
+  config.SwitchUseFeedFetchOps(false);
+  // Set the input's min, max, opt shape
+  config.EnableTensorRtEngine(1 << 30, 1, 1,
+                              AnalysisConfig::Precision::kFloat32, true, true);
+  if (delete_conv_bn) {
+    config.pass_builder()->DeletePass("conv_bn_fuse_pass");
+  }
+  if (with_dynamic) {
+    std::map<std::string, std::vector<int>> min_input_shape = {
+        {"image", {1, 1, 3, 3}}};
+    std::map<std::string, std::vector<int>> max_input_shape = {
+        {"image", {1, 1, 10, 10}}};
+    std::map<std::string, std::vector<int>> opt_input_shape = {
+        {"image", {1, 1, 3, 3}}};
+
+    config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
+                                  opt_input_shape);
+  }
+  auto predictor = CreatePaddlePredictor(config);
+  auto input_names = predictor->GetInputNames();
+  int channels = 1;
+  int height = 3;
+  int width = 3;
+  int input_num = channels * height * width * 1;
+
+  float *input = new float[input_num];
+  memset(input, 0, input_num * sizeof(float));
+  auto input_t = predictor->GetInputTensor(input_names[0]);
+  input_t->Reshape({1, channels, height, width});
+  input_t->copy_from_cpu(input);
+
+  ASSERT_TRUE(predictor->ZeroCopyRun());
+
+  std::vector<float> out_data;
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+  out_data.resize(out_num);
+  output_t->copy_to_cpu(out_data.data());
+
+  auto predictor2 = predictor->Clone();
+  auto input_t2 = predictor2->GetInputTensor(input_names[0]);
+  input_t2->Reshape({1, channels, height, width});
+  input_t2->copy_from_cpu(input);
+
+  ASSERT_TRUE(predictor2->ZeroCopyRun());
+
+  std::vector<float> out_data2;
+  auto output_t2 = predictor2->GetOutputTensor(output_names[0]);
+  std::vector<int> output_shape2 = output_t2->shape();
+  int out_num2 = std::accumulate(output_shape2.begin(), output_shape2.end(), 1,
+                                 std::multiplies<int>());
+  out_data2.resize(out_num2);
+  output_t2->copy_to_cpu(out_data2.data());
+  ASSERT_TRUE(out_data2.size() == out_data.size());
+  for (size_t i = 0; i < out_data.size(); i++) {
+    EXPECT_NEAR(out_data2[i], out_data[i], 1e-5);
+  }
+}
+
 TEST(AnalysisPredictor, trt_dynamic) { TestDynamic(true); }
 TEST(AnalysisPredictor, trt_static) { TestDynamic(false); }
 TEST(AnalysisPredictor, trt_memory_serialize) {
@@ -218,6 +299,7 @@ TEST(AnalysisPredictor, trt_memory_serialize) {
 TEST(AnalysisPredictor, trt_dynamic2) { TestDynamic2(); }
 
 TEST(AnalysisPredictor, trt_tuned_dynamic) { TestTunedDynamic(); }
+TEST(AnalysisPredictor, trt_dynamic_clone) { TestDynamicClone(); }
 
 }  // namespace inference
 }  // namespace paddle

From 3dfed217a7034ea458ebfc4f8b585f0178226581 Mon Sep 17 00:00:00 2001
From: wenbin
Date: Wed, 29 Dec 2021 05:06:13 +0000
Subject: [PATCH 12/12] false

---
 paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
index 11aee2a4a4749..ccdf237ffa54d 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc
@@ -228,8 +228,8 @@ void TestDynamicClone(bool with_dynamic = true, bool delete_cache = true,
 
   config.SwitchUseFeedFetchOps(false);
   // Set the input's min, max, opt shape
-  config.EnableTensorRtEngine(1 << 30, 1, 1,
-                              AnalysisConfig::Precision::kFloat32, true, true);
+  config.EnableTensorRtEngine(
+      1 << 30, 1, 1, AnalysisConfig::Precision::kFloat32, false, false);
   if (delete_conv_bn) {
     config.pass_builder()->DeletePass("conv_bn_fuse_pass");
  }
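
A closing note on patch 12: the one-line change stops the new clone test from exercising engine serialization and INT8 calibration, keeping it hermetic. Annotated against my reading of the AnalysisConfig::EnableTensorRtEngine signature; the comments are editorial, not part of the patch:

    config.EnableTensorRtEngine(
        1 << 30,                              // workspace_size
        1,                                    // max_batch_size
        1,                                    // min_subgraph_size
        AnalysisConfig::Precision::kFloat32,  // precision
        false,                                // use_static: no engine serialization
        false);                               // use_calib_mode: no INT8 calibration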