
Update stream rebind #51412

Closed
65 changes: 35 additions & 30 deletions paddle/fluid/inference/api/resource_manager.cc
@@ -483,24 +483,16 @@ void ResourceManager::DestroyGPUResource(void* stream) {
}

void ResourceManager::Decrease(void* stream) {
  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
                    true,
                    platform::errors::InvalidArgument(
                        "The stream[%p] not found in ref_count.", stream));
  if (ref_count_.count(stream) == 0) return;
  --ref_count_[stream];

  if (ref_count_[stream] == 0) {
    ref_count_.erase(stream);
    gpu_resources_.erase(stream);
    if (gpu_resources_.count(stream) > 0) gpu_resources_.erase(stream);
  }
}

void ResourceManager::Increase(void* stream) {
  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
                    true,
                    platform::errors::InvalidArgument(
                        "The stream[%p] not found in ref_count.", stream));
  ++ref_count_[stream];
}
void ResourceManager::Increase(void* stream) { ++ref_count_[stream]; }

Reviewer comment (Contributor): Now that Rebind takes a lock, does this check still need to be removed?
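Taken together, the new Decrease/Increase pair implements a tolerant reference count: Increase inserts a zero entry on first use and increments it, while Decrease ignores unknown streams and erases both the count and the GPU resource once the count reaches zero. A minimal standalone sketch of that pattern, with placeholder names (ref_count, resources) rather than Paddle's members:

#include <map>

std::map<void*, int> ref_count;  // stands in for ref_count_
std::map<void*, int> resources;  // stands in for gpu_resources_; value type is irrelevant here

void Increase(void* stream) { ++ref_count[stream]; }  // operator[] value-initializes to 0 first

void Decrease(void* stream) {
  if (ref_count.count(stream) == 0) return;  // tolerate streams that were never registered
  if (--ref_count[stream] == 0) {
    ref_count.erase(stream);
    if (resources.count(stream) > 0) resources.erase(stream);  // release the resource too
  }
}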

GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
  PADDLE_ENFORCE_EQ(gpu_resources_.count(stream),
@@ -512,31 +504,44 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {

void ResourceManager::GpuResourceReBindStream(void* old_stream,
                                              void* new_stream) {
  // NOTE: take a lock so that stream rebinding is safe in multi-threaded use.
  std::lock_guard<std::mutex> lock_guard(gpu_mutex_);

  PADDLE_ENFORCE_EQ(
      gpu_resources_.count(old_stream),
      true,
      platform::errors::InvalidArgument(
          "The stream[%p] not found in gpu_resources.", old_stream));
  auto gpu_resource = std::move(gpu_resources_.at(old_stream));
  DestroyGPUResource(old_stream);
  PADDLE_ENFORCE_EQ(
      ref_count_.count(old_stream),
      0,
      platform::errors::Fatal("gpu resources rebind stream failed."));

  gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindBlasTensorCoreHandle(
      static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
                                  gpu_resource->Place());

  // NOTE: a stream may be shared by multiple predictors; skip the resource
  // operations if a resource for new_stream already exists.
  bool new_stream_existed = gpu_resources_.count(new_stream) > 0;
  if (!new_stream_existed) {
    if (ref_count_[old_stream] == 1) {
      // NOTE: if old_stream's ref_count is 1, old_stream is used only by the
      // current predictor, so move its resource and rebind it to new_stream.
      auto gpu_resource = std::move(gpu_resources_.at(old_stream));
      gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindBlasTensorCoreHandle(
          static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
                                      gpu_resource->Place());
      gpu_resources_.emplace(new_stream, std::move(gpu_resource));
    } else {
      // old_stream is shared with other predictors: leave its resource in
      // place and create a fresh resource bound to new_stream.
      auto place = gpu_resources_.at(old_stream)->Place();
      std::unique_ptr<GPUContextResource> resource{
          new GPUContextResource(place, new_stream)};
      gpu_resources_.emplace(new_stream, std::move(resource));
    }
  }

  Decrease(old_stream);
  ref_count_[new_stream]++;
  gpu_resources_.emplace(new_stream, std::move(gpu_resource));
}
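The rebind therefore reduces to a three-way case split on whether new_stream already has a resource and on old_stream's ref count. A self-contained sketch of just that control flow follows; Resource stands in for GPUContextResource and every name here is illustrative, not Paddle's:

#include <map>
#include <memory>

struct Resource {                       // stand-in for GPUContextResource
  void* stream = nullptr;
  void Rebind(void* s) { stream = s; }  // stands in for the ReBind* calls above
};

std::map<void*, std::unique_ptr<Resource>> resources;
std::map<void*, int> ref_count;

void RebindStream(void* old_stream, void* new_stream) {
  if (resources.count(new_stream) == 0) {
    if (ref_count[old_stream] == 1) {
      // Case 1: sole user of old_stream -- move the old resource and rebind it.
      auto r = std::move(resources.at(old_stream));
      r->Rebind(new_stream);
      resources.emplace(new_stream, std::move(r));
    } else {
      // Case 2: old_stream is shared -- build a fresh resource for new_stream.
      auto r = std::make_unique<Resource>();
      r->Rebind(new_stream);
      resources.emplace(new_stream, std::move(r));
    }
  }  // Case 3: new_stream already has a resource -- only the counts change.

  // Transfer one reference from old_stream to new_stream; like Decrease above,
  // drop old_stream's entries once its count reaches zero.
  if (ref_count.count(old_stream) && --ref_count[old_stream] == 0) {
    ref_count.erase(old_stream);
    resources.erase(old_stream);
  }
  ++ref_count[new_stream];
}

The real code routes the decrement through Decrease so the erase logic lives in one place.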

int ResourceManager::RefCount(void* stream) const {
10 changes: 10 additions & 0 deletions paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -981,6 +981,14 @@ if(WITH_GPU AND TENSORRT_FOUND)
    ${INFERENCE_C_EXTRA_DEPS}
    ARGS
    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
  inference_analysis_test(
    trt_rebind_stream_test
    SRCS
    trt_rebind_stream_test.cc
    EXTRA_DEPS
    paddle_inference_shared
    ARGS
    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
  if(WIN32)
    target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c_shared)
  else()
@@ -1348,6 +1356,8 @@ if(WITH_GPU)
endif()
if(WITH_GPU AND TENSORRT_FOUND)
  set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120)
  set_tests_properties(trt_rebind_stream_test PROPERTIES LABELS
                                                         "RUN_TYPE=EXCLUSIVE")
  if(WITH_MKLDNN)
    set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120)
  endif()
100 changes: 100 additions & 0 deletions paddle/fluid/inference/tests/api/trt_rebind_stream_test.cc
@@ -0,0 +1,100 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <glog/logging.h>
#include <gtest/gtest.h>

#include "gflags/gflags.h"
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"

namespace paddle {
namespace inference {

TEST(ReBindStream_single, use_gpu) {
  std::string model_dir = FLAGS_infer_model + "/mobilenet";
  AnalysisConfig config;
  config.EnableUseGpu(100, 0);
  config.SetModel(model_dir);
  config.EnableTensorRtEngine();

  cudaStream_t stream1, stream2, stream3;
  cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
  cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
  cudaStreamCreateWithFlags(&stream3, cudaStreamNonBlocking);

  // Create the predictor on stream1, then rebind the same predictor to
  // stream2 and stream3 in turn.
  config.SetExecStream(stream1);
  auto predictor = paddle_infer::CreatePredictor(config);
  auto x_t = predictor->GetInputHandle("x");
  x_t->Reshape({1, 3, 224, 224});
  float x_data[3 * 224 * 224] = {0};
  x_t->CopyFromCpu(x_data);
  ASSERT_TRUE(predictor->Run());
  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor.get(), stream2));
  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor.get(), stream3));
}

TEST(ReBindStream_multi, use_gpu) {
  std::string model_dir = FLAGS_infer_model + "/mobilenet";
  AnalysisConfig config1;
  config1.EnableUseGpu(100, 0);
  config1.SetModel(model_dir);
  config1.EnableTensorRtEngine();
  AnalysisConfig config2;
  config2.EnableUseGpu(100, 0);
  config2.EnableTensorRtEngine();
  config2.SetModel(model_dir);

  cudaStream_t stream1, stream2, stream3;
  cudaStreamCreate(&stream1);
  cudaStreamCreate(&stream2);
  cudaStreamCreate(&stream3);

  // Both predictors share stream1, so the resource manager's ref_count for
  // stream1 starts at 2 and the rebinds below exercise the shared-stream path.
  config1.SetExecStream(stream1);
  config2.SetExecStream(stream1);
  auto predictor1 = paddle_infer::CreatePredictor(config1);
  auto predictor2 = paddle_infer::CreatePredictor(config2);

  std::vector<float> x1(3 * 224 * 224, 1.0);
  auto x_t1 = predictor1->GetInputHandle("x");
  x_t1->Reshape({1, 3, 224, 224});
  x_t1->CopyFromCpu(x1.data());
  std::vector<float> x2(3 * 224 * 224, 2.0);
  auto x_t2 = predictor2->GetInputHandle("x");
  x_t2->Reshape({1, 3, 224, 224});
  x_t2->CopyFromCpu(x2.data());

  ASSERT_TRUE(predictor1->Run());
  cudaStreamSynchronize(stream1);
  ASSERT_TRUE(predictor2->Run());
  cudaStreamSynchronize(stream1);

  // Rebind both predictors to stream2: the first rebind moves or creates the
  // resource for stream2, the second only transfers a reference.
  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor1.get(), stream2));
  cudaDeviceSynchronize();
  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor2.get(), stream2));
  cudaDeviceSynchronize();

  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor1.get(), stream3));
  cudaStreamSynchronize(stream3);
  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor2.get(), stream3));
  cudaStreamSynchronize(stream3);
}
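Neither test inspects the model output. As a hedged extension (not part of this PR), a small helper like the one below could copy the first output back to the host so results can be compared across rebinds; it uses only the standard paddle_infer tensor API, and the function name is ours:

#include <functional>
#include <numeric>
#include <vector>

// Illustrative helper, not in the PR: fetch a predictor's first output so the
// values can be compared before and after a stream rebind.
static std::vector<float> FetchFirstOutput(paddle_infer::Predictor* predictor) {
  auto out_t = predictor->GetOutputHandle(predictor->GetOutputNames().front());
  std::vector<int> shape = out_t->shape();
  int numel =
      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
  std::vector<float> out(numel);
  out_t->CopyToCpu(out.data());  // blocking copy of the output back to the host
  return out;
}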

} // namespace inference
} // namespace paddle