diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc
index 49d33d6750abe..5dbd6aae233b0 100644
--- a/paddle/fluid/inference/api/resource_manager.cc
+++ b/paddle/fluid/inference/api/resource_manager.cc
@@ -483,24 +483,16 @@ void ResourceManager::DestroyGPUResource(void* stream) {
 }
 
 void ResourceManager::Decrease(void* stream) {
-  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in ref_count.", stream));
+  if (ref_count_.count(stream) == 0) return;
   --ref_count_[stream];
+
   if (ref_count_[stream] == 0) {
     ref_count_.erase(stream);
-    gpu_resources_.erase(stream);
+    if (gpu_resources_.count(stream) > 0) gpu_resources_.erase(stream);
   }
 }
 
-void ResourceManager::Increase(void* stream) {
-  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
-                    true,
-                    platform::errors::InvalidArgument(
-                        "The stream[%p] not found in ref_count.", stream));
-  ++ref_count_[stream];
-}
+void ResourceManager::Increase(void* stream) { ++ref_count_[stream]; }
 
 GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
   PADDLE_ENFORCE_EQ(gpu_resources_.count(stream),
@@ -512,31 +504,44 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
 
 void ResourceManager::GpuResourceReBindStream(void* old_stream,
                                               void* new_stream) {
+  // NOTE: lock to support stream rebinding from multiple threads.
+  std::lock_guard<std::mutex> lock_guard(gpu_mutex_);
+
   PADDLE_ENFORCE_EQ(
       gpu_resources_.count(old_stream),
       true,
       platform::errors::InvalidArgument(
           "The stream[%p] not found in gpu_resources.", old_stream));
-  auto gpu_resource = std::move(gpu_resources_.at(old_stream));
-  DestroyGPUResource(old_stream);
-  PADDLE_ENFORCE_EQ(
-      ref_count_.count(old_stream),
-      0,
-      platform::errors::Fatal("gpu resources rebind stream failed."));
-
-  gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindBlasTensorCoreHandle(
-      static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
-  gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
-                                  gpu_resource->Place());
+  // NOTE: a stream may be used by multiple predictors; skip the resource
+  // operation if a resource for new_stream already exists.
+  bool new_stream_existed = gpu_resources_.count(new_stream) > 0;
+  if (!new_stream_existed) {
+    if (ref_count_[old_stream] == 1) {
+      // NOTE: if old_stream's ref_count is 1, old_stream is only used by the
+      // current predictor, so rebind its resource to new_stream.
+      auto gpu_resource = std::move(gpu_resources_.at(old_stream));
+      gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
+      gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
+      gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
+      gpu_resource->ReBindBlasTensorCoreHandle(
+          static_cast<gpuStream_t>(new_stream));
+      gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
+      gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
+      gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
+      gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
+                                      gpu_resource->Place());
+      gpu_resources_.emplace(new_stream, std::move(gpu_resource));
+    } else {
+      auto place = gpu_resources_.at(old_stream)->Place();
+      std::unique_ptr<GPUContextResource> resource{
+          new GPUContextResource(place, new_stream)};
+      gpu_resources_.emplace(new_stream, std::move(resource));
+    }
+  }
+
+  Decrease(old_stream);
   ref_count_[new_stream]++;
-  gpu_resources_.emplace(new_stream, std::move(gpu_resource));
 }
 
 int ResourceManager::RefCount(void* stream) const {
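The behavioral core of the change above is the new ownership rule in
GpuResourceReBindStream: the old stream's resource is moved to the new stream
only when the calling predictor is its sole user; otherwise a fresh resource is
created for the new stream and the shared one stays in place, with Decrease()
merely dropping one reference. The standalone sketch below (not Paddle code;
Resource, Manager, Add and Rebind are illustrative stand-ins for
GPUContextResource and the ResourceManager methods) walks through that
ref-counting scheme:

// Minimal sketch of the ref-counting rule implemented by the patch above.
#include <cassert>
#include <map>
#include <memory>

struct Resource {
  explicit Resource(void* s) : stream(s) {}
  void Rebind(void* s) { stream = s; }  // stands in for the ReBind*Handle calls
  void* stream;
};

class Manager {
 public:
  // A predictor registers the stream it was created with.
  void Add(void* stream) {
    if (resources_.count(stream) == 0) {
      resources_.emplace(stream, std::make_unique<Resource>(stream));
    }
    ++ref_count_[stream];
  }

  // Tolerates unknown streams instead of failing, like the new Decrease().
  void Decrease(void* stream) {
    if (ref_count_.count(stream) == 0) return;
    if (--ref_count_[stream] == 0) {
      ref_count_.erase(stream);
      resources_.erase(stream);
    }
  }

  // Mirrors GpuResourceReBindStream: move when sole user, otherwise create.
  void Rebind(void* old_stream, void* new_stream) {
    if (resources_.count(new_stream) == 0) {  // skip if target already exists
      if (ref_count_[old_stream] == 1) {
        auto r = std::move(resources_.at(old_stream));  // sole user: move it
        r->Rebind(new_stream);
        resources_.emplace(new_stream, std::move(r));
      } else {  // stream is shared: leave it alone, build a new resource
        resources_.emplace(new_stream, std::make_unique<Resource>(new_stream));
      }
    }
    Decrease(old_stream);  // also erases the moved-from entry when count hits 0
    ++ref_count_[new_stream];
  }

  bool Has(void* stream) const { return resources_.count(stream) > 0; }
  int RefCount(void* stream) const {
    auto it = ref_count_.find(stream);
    return it == ref_count_.end() ? 0 : it->second;
  }

 private:
  std::map<void*, int> ref_count_;
  std::map<void*, std::unique_ptr<Resource>> resources_;
};

int main() {
  int s1, s2, s3;  // dummy "streams": only their addresses matter here
  Manager m;
  m.Add(&s1);          // predictor 1 uses s1
  m.Add(&s1);          // predictor 2 shares s1 -> ref count is now 2
  m.Rebind(&s1, &s2);  // s1 is shared, so a new resource is created for s2
  assert(m.Has(&s1) && m.Has(&s2) && m.RefCount(&s1) == 1);
  m.Rebind(&s1, &s3);  // sole user now: the s1 resource is moved to s3
  assert(!m.Has(&s1) && m.Has(&s3));
  return 0;
}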
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 693f0582c5d52..b7ef6e49cd691 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -981,6 +981,14 @@ if(WITH_GPU AND TENSORRT_FOUND)
     ${INFERENCE_C_EXTRA_DEPS}
     ARGS
     --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
+  inference_analysis_test(
+    trt_rebind_stream_test
+    SRCS
+    trt_rebind_stream_test.cc
+    EXTRA_DEPS
+    paddle_inference_shared
+    ARGS
+    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
   if(WIN32)
     target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c_shared)
   else()
@@ -1348,6 +1356,8 @@ if(WITH_GPU)
 endif()
 if(WITH_GPU AND TENSORRT_FOUND)
   set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120)
+  set_tests_properties(trt_rebind_stream_test PROPERTIES LABELS
+                                                         "RUN_TYPE=EXCLUSIVE")
   if(WITH_MKLDNN)
     set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120)
   endif()
diff --git a/paddle/fluid/inference/tests/api/trt_rebind_stream_test.cc b/paddle/fluid/inference/tests/api/trt_rebind_stream_test.cc
new file mode 100644
index 0000000000000..2b5e1bb1d54b4
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/trt_rebind_stream_test.cc
@@ -0,0 +1,100 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
+
+namespace paddle {
+namespace inference {
+
+TEST(ReBindStream_single, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/mobilenet";
+  AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(model_dir);
+  config.EnableTensorRtEngine();
+
+  cudaStream_t stream1, stream2, stream3;
+  cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
+  cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
+  cudaStreamCreateWithFlags(&stream3, cudaStreamNonBlocking);
+
+  config.SetExecStream(stream1);
+  auto predictor = paddle_infer::CreatePredictor(config);
+  auto x_t = predictor->GetInputHandle("x");
+  x_t->Reshape({1, 3, 224, 224});
+  float x_data[3 * 224 * 224] = {0};
+  x_t->CopyFromCpu(x_data);
+  ASSERT_TRUE(predictor->Run());
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor.get(), stream2));
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor.get(), stream3));
+}
+
+TEST(ReBindStream_multi, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/mobilenet";
+  AnalysisConfig config1;
+  config1.EnableUseGpu(100, 0);
+  config1.SetModel(model_dir);
+  config1.EnableTensorRtEngine();
+  AnalysisConfig config2;
+  config2.EnableUseGpu(100, 0);
+  config2.EnableTensorRtEngine();
+  config2.SetModel(model_dir);
+
+  cudaStream_t stream1, stream2, stream3;
+  cudaStreamCreate(&stream1);
+  cudaStreamCreate(&stream2);
+  cudaStreamCreate(&stream3);
+
+  config1.SetExecStream(stream1);
+  config2.SetExecStream(stream1);
+  auto predictor1 = paddle_infer::CreatePredictor(config1);
+  auto predictor2 = paddle_infer::CreatePredictor(config2);
+
+  std::vector<float> x1(3 * 224 * 224, 1.0);
+  auto x_t1 = predictor1->GetInputHandle("x");
+  x_t1->Reshape({1, 3, 224, 224});
+  x_t1->CopyFromCpu(x1.data());
+  std::vector<float> x2(3 * 224 * 224, 2.0);
+  auto x_t2 = predictor2->GetInputHandle("x");
+  x_t2->Reshape({1, 3, 224, 224});
+  x_t2->CopyFromCpu(x2.data());
+
+  ASSERT_TRUE(predictor1->Run());
+  cudaStreamSynchronize(stream1);
+  ASSERT_TRUE(predictor2->Run());
+  cudaStreamSynchronize(stream1);
+
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor1.get(), stream2));
+  cudaDeviceSynchronize();
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor2.get(), stream2));
+  cudaDeviceSynchronize();
+
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor1.get(), stream3));
+  cudaStreamSynchronize(stream3);
+  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor2.get(), stream3));
+  cudaStreamSynchronize(stream3);
+}
+
+}  // namespace inference
+}  // namespace paddle
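The tests above double as usage documentation: a predictor is created on one
stream via AnalysisConfig::SetExecStream and later runs are redirected with
paddle_infer::experimental::InternalUtils::RunWithExternalStream, which is what
drives the rebind path added in resource_manager.cc. A condensed,
application-level sketch of that pattern follows; it assumes the public
paddle_inference_api.h header, a local ./mobilenet model directory and an input
named "x" as in the tests, and omits error checking:

// Hypothetical standalone usage sketch; the model path and input name "x" are
// assumptions carried over from the tests above.
#include <cuda_runtime.h>

#include <vector>

#include "paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.EnableUseGpu(100, 0);
  config.SetModel("./mobilenet");
  config.EnableTensorRtEngine();

  cudaStream_t init_stream, req_stream;
  cudaStreamCreateWithFlags(&init_stream, cudaStreamNonBlocking);
  cudaStreamCreateWithFlags(&req_stream, cudaStreamNonBlocking);
  config.SetExecStream(init_stream);

  auto predictor = paddle_infer::CreatePredictor(config);
  std::vector<float> input(3 * 224 * 224, 1.0f);
  auto x = predictor->GetInputHandle("x");
  x->Reshape({1, 3, 224, 224});
  x->CopyFromCpu(input.data());

  // First run uses init_stream, the stream the predictor was created with.
  predictor->Run();
  cudaStreamSynchronize(init_stream);

  // Later runs can be redirected to another stream. Since this predictor is
  // the only user of init_stream, its GPU resources are rebound (moved) to
  // req_stream instead of being re-created.
  paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor.get(), req_stream);
  cudaStreamSynchronize(req_stream);
  return 0;
}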