Revert "[inference][trt]add trt sparse weights switch (PaddlePaddle#53562)"

This reverts commit 4a69a53.
zhangjun · May 17, 2023 · 888b409 · 888b409
1 parent 2cb2801
commit 888b409
Show file tree

Hide file tree

Showing 11 changed files with 16 additions and 51 deletions.
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
@@ -250,9 +250,6 @@ struct Argument {
                       TensorRtAllowBuildAtRuntime,
                       bool);
   DECL_ARGUMENT_FIELD(tensorrt_use_inspector, TensorRtUseInspector, bool);
-  DECL_ARGUMENT_FIELD(tensorrt_use_sparse_weights,
-                      TensorRtUseSparseWeights,
-                      bool);
 
   DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
   DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);

diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -213,8 +213,6 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_static_engine", new bool(use_static_engine));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
       pass->Set("use_inspector", new bool(argument->tensorrt_use_inspector()));
-      pass->Set("use_sparse_weights",
-                new bool(argument->tensorrt_use_sparse_weights()));
 
       // tuned trt dynamic_shape
       pass->Set("trt_shape_range_info_path",

diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -524,7 +524,6 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
   op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
   op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
   op_desc->SetAttr("use_inspector", Get<bool>("use_inspector"));
-  op_desc->SetAttr("use_sparse_weights", Get<bool>("use_sparse_weights"));
   op_desc->SetAttr("model_precision", Get<int>("model_precision"));
   op_desc->SetAttr("with_dynamic_shape", with_dynamic_shape);
 
@@ -616,14 +615,17 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
     opt_input_shape = {};
   }
 
-  const float trt_compile_version = tensorrt::TrtMajorVersion(TRT_VERSION);
-  const float trt_runtime_version =
-      tensorrt::TrtMajorVersion(tensorrt::GetInferLibVersion());
-  if (trt_compile_version != trt_runtime_version) {
+  auto to_major_version = [&](int full_version) -> float {
+    return (full_version / 100) / 10.0;
+  };
+  const float compile_time_trt_version = to_major_version(TRT_VERSION);
+  const float run_time_trt_version =
+      to_major_version(tensorrt::GetInferLibVersion());
+  if (compile_time_trt_version != run_time_trt_version) {
     LOG_FIRST_N(WARNING, 1)
         << "The Paddle Inference library is compiled with "
-        << trt_compile_version << " version TensorRT, "
-        << "but the runtime TensorRT you are using is " << trt_runtime_version
+        << compile_time_trt_version << " version TensorRT, "
+        << "but the runtime TensorRT you are using is " << run_time_trt_version
         << " version. "
            "This might cause serious compatibility issues. We strongly "
            "recommend using the same TRT version at runtime.";
@@ -665,7 +667,6 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
   trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
   trt_engine->SetDLACore(Get<int>("trt_dla_core"));
   trt_engine->SetUseInspector(Get<bool>("use_inspector"));
-  trt_engine->SetUseSparseWeights(Get<bool>("use_sparse_weights"));
   trt_engine->SetWithErnie(
       graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
       graph->Has(framework::ir::kMultiheadMatmulPass));

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
@@ -451,7 +451,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(collect_shape_range_info_);
   CP_MEMBER(shape_range_info_path_);
   CP_MEMBER(trt_use_inspector_);
-  CP_MEMBER(trt_use_sparse_weights_);
   CP_MEMBER(trt_engine_memory_sharing_);
   CP_MEMBER(trt_engine_memory_sharing_identifier_);
   // Dlnne related
@@ -806,10 +805,6 @@ void AnalysisConfig::EnableTensorRtDLA(int dla_core) {
 
 void AnalysisConfig::EnableTensorRtInspector() { trt_use_inspector_ = true; }
 
-void AnalysisConfig::EnableTensorRtSparseWeights() {
-  trt_use_sparse_weights_ = true;
-}
-
 void AnalysisConfig::Exp_DisableTensorRtOPs(
     const std::vector<std::string> &ops) {
   trt_disabled_ops_.insert(trt_disabled_ops_.end(), ops.begin(), ops.end());

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1397,7 +1397,6 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetTensorRtAllowBuildAtRuntime(
         config_.trt_allow_build_at_runtime());
     argument_->SetTensorRtUseInspector(config_.trt_use_inspector_);
-    argument_->SetTensorRtUseSparseWeights(config_.trt_use_sparse_weights_);
     argument_->SetTrtEngineMemorySharing(config_.trt_engine_memory_sharing());
   }
 

diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -742,9 +742,6 @@ struct PD_INFER_DECL AnalysisConfig {
   void EnableTensorRtInspector();
   bool tensorrt_inspector_enabled() { return trt_use_inspector_; }
 
-  void EnableTensorRtSparseWeights();
-  bool tensorrt_sparse_weights_enabled() { return trt_use_sparse_weights_; }
-
   void EnableDlnne(
       int min_subgraph_size = 3,
       int max_batch_size = 1,
@@ -1121,7 +1118,6 @@ struct PD_INFER_DECL AnalysisConfig {
   // tune to get dynamic_shape info.
   bool trt_tuned_dynamic_shape_{false};
   bool trt_use_inspector_{false};
-  bool trt_use_sparse_weights_{false};
 
   // In CollectShapeInfo mode, we will collect the shape information of
   // all intermediate tensors in the compute graph and calculate the

diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
@@ -207,6 +207,12 @@ void TensorRTEngine::FreezeNetwork() {
   infer_builder_config_->setMaxWorkspaceSize(max_workspace_);
 #endif
 
+#if IS_TRT_VERSION_GE(8500)
+  infer_builder_config_->setPreviewFeature(
+      nvinfer1::PreviewFeature::kFASTER_DYNAMIC_SHAPES_0805, true);
+#else
+#endif
+
   bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf);
   if (enable_fp16) {
     bool support_fp16 = infer_builder_->platformHasFastFp16();
@@ -357,7 +363,6 @@ void TensorRTEngine::FreezeNetwork() {
                    "opt_shape, false /*disable_trt_plugin_fp16*/)'";
     }
   }
-
 #if IS_TRT_VERSION_GE(8200)
   if (use_inspector_) {
     infer_builder_config_->setProfilingVerbosity(
@@ -369,9 +374,7 @@ void TensorRTEngine::FreezeNetwork() {
   infer_engine_.reset(infer_builder_->buildEngineWithConfig(
       *network(), *infer_builder_config_));
 #else
-  if (use_sparse_weights_) {
-    infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
-  }
+  infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
   ihost_memory_.reset(infer_builder_->buildSerializedNetwork(
       *network(), *infer_builder_config_));
   infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));

diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
@@ -738,9 +738,6 @@ class TensorRTEngine {
   void GetEngineInfo();
 
   void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; }
-  void SetUseSparseWeights(bool use_sparse_weights) {
-    use_sparse_weights_ = use_sparse_weights;
-  }
   void SetScope(const framework::Scope& scope) { scope_ = &scope; }
 
   void SetContextMemorySharing(bool context_memory_sharing) {
@@ -829,7 +826,6 @@ class TensorRTEngine {
 #endif
   std::mutex mutex_;
   bool use_inspector_;
-  bool use_sparse_weights_{false};
 
  public:
   thread_local static int predictor_id_per_thread;

diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
@@ -96,10 +96,6 @@ static std::tuple<int, int, int> GetTrtCompileVersion() {
       NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH};
 }
 
-static float TrtMajorVersion(int full_version) {
-  return (full_version / 100) / 10.0;
-}
-
 template <typename T>
 struct Destroyer {
   void operator()(T* x) {

diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
@@ -879,10 +879,6 @@ void BindAnalysisConfig(py::module *m) {
            &AnalysisConfig::EnableTensorRtInspector)
       .def("tensorrt_inspector_enabled",
            &AnalysisConfig::tensorrt_inspector_enabled)
-      .def("enable_tensorrt_sparse_weights",
-           &AnalysisConfig::EnableTensorRtSparseWeights)
-      .def("tensorrt_sparse_weights_enabled",
-           &AnalysisConfig::tensorrt_sparse_weights_enabled)
       .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
       .def("enable_dlnne",
            &AnalysisConfig::EnableDlnne,

diff --git a/test/ir/inference/test_trt_inference_predictor.py b/test/ir/inference/test_trt_inference_predictor.py
@@ -84,8 +84,6 @@ def load(self, config_arg, inputs=None, outpus=None):
         # enable memory optim
         if not self.args.enable_tune:
             config.enable_memory_optim()
-        if self.args.enable_trt_sparse_weights:
-            config.enable_tensorrt_sparse_weights()
 
         config.set_cpu_math_library_num_threads(self.args.cpu_threads)
         config.switch_ir_optim(True)
@@ -260,9 +258,6 @@ def parse_args():
     parser.add_argument('--enable_dynamic_shape', type=str2bool, default=True)
     parser.add_argument('--enable_tune', type=str2bool, default=False)
     parser.add_argument('--enable_profile', type=str2bool, default=False)
-    parser.add_argument(
-        '--enable_trt_sparse_weights', type=str2bool, default=False
-    )
     parser.add_argument('--enable_benchmark', type=str2bool, default=True)
     parser.add_argument('--save_result', type=str2bool, default=False)
     parser.add_argument('--return_result', type=str2bool, default=False)
@@ -313,13 +308,6 @@ def run_infer(model_path):
     backend.load(conf)
     backend.predict()
 
-    # run inference predictor, enable trt sparse weights
-    conf.enable_tune = False
-    conf.enable_trt_sparse_weights = True
-    backend = BackendPaddle()
-    backend.load(conf)
-    backend.predict()
-
 
 class ConvBNLayer(paddle.nn.Layer):
     def __init__(