
Update stream rebind #51412

Closed
65 changes: 35 additions & 30 deletions paddle/fluid/inference/api/resource_manager.cc
@@ -483,24 +483,16 @@ void ResourceManager::DestroyGPUResource(void* stream) {
}

void ResourceManager::Decrease(void* stream) {
  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
                    true,
                    platform::errors::InvalidArgument(
                        "The stream[%p] not found in ref_count.", stream));
  if (ref_count_.count(stream) == 0) return;
  --ref_count_[stream];

  if (ref_count_[stream] == 0) {
    ref_count_.erase(stream);
    gpu_resources_.erase(stream);
    if (gpu_resources_.count(stream) > 0) gpu_resources_.erase(stream);
  }
}

void ResourceManager::Increase(void* stream) {
  PADDLE_ENFORCE_EQ(ref_count_.count(stream),
                    true,
                    platform::errors::InvalidArgument(
                        "The stream[%p] not found in ref_count.", stream));
  ++ref_count_[stream];
}
void ResourceManager::Increase(void* stream) { ++ref_count_[stream]; }

Reviewer comment (Contributor): Now that Rebind takes a lock, does this check still need to be removed?
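Taken together, the new Decrease/Increase pair implements a tolerant reference count: Increase inserts a zero entry on first use and increments it, while Decrease ignores unknown streams and erases both the count and the GPU resource once the count reaches zero. A minimal standalone sketch of that pattern, with placeholder names (ref_count, resources) rather than Paddle's members:

#include <map>

std::map<void*, int> ref_count;  // stands in for ref_count_
std::map<void*, int> resources;  // stands in for gpu_resources_; value type is irrelevant here

void Increase(void* stream) { ++ref_count[stream]; }  // operator[] value-initializes to 0 first

void Decrease(void* stream) {
  if (ref_count.count(stream) == 0) return;  // tolerate streams that were never registered
  if (--ref_count[stream] == 0) {
    ref_count.erase(stream);
    if (resources.count(stream) > 0) resources.erase(stream);  // release the resource too
  }
}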

GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {
  PADDLE_ENFORCE_EQ(gpu_resources_.count(stream),
@@ -512,31 +504,44 @@ GPUContextResource* ResourceManager::GetGPUResource(void* stream) const {

void ResourceManager::GpuResourceReBindStream(void* old_stream,
                                              void* new_stream) {
  // NOTE: take a lock so that stream rebinding is safe in multi-threaded use.
  std::lock_guard<std::mutex> lock_guard(gpu_mutex_);

  PADDLE_ENFORCE_EQ(
      gpu_resources_.count(old_stream),
      true,
      platform::errors::InvalidArgument(
          "The stream[%p] not found in gpu_resources.", old_stream));
  auto gpu_resource = std::move(gpu_resources_.at(old_stream));
  DestroyGPUResource(old_stream);
  PADDLE_ENFORCE_EQ(
      ref_count_.count(old_stream),
      0,
      platform::errors::Fatal("gpu resources rebind stream failed."));

  gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindBlasTensorCoreHandle(
      static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
  gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
                                  gpu_resource->Place());

  // NOTE: a stream may be shared by multiple predictors; skip the resource
  // operations if a resource for new_stream already exists.
  bool new_stream_existed = gpu_resources_.count(new_stream) > 0;
  if (!new_stream_existed) {
    if (ref_count_[old_stream] == 1) {
      // NOTE: if old_stream's ref_count is 1, old_stream is used only by the
      // current predictor, so move its resource and rebind it to new_stream.
      auto gpu_resource = std::move(gpu_resources_.at(old_stream));
      gpu_resource->ReBindStream(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindDnnHandle(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindBlasHandle(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindBlasTensorCoreHandle(
          static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindBlasTF32Handle(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindSolverDnHandle(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindSparseHandle(static_cast<gpuStream_t>(new_stream));
      gpu_resource->ReBindEigenDevice(static_cast<gpuStream_t>(new_stream),
                                      gpu_resource->Place());
      gpu_resources_.emplace(new_stream, std::move(gpu_resource));
    } else {
      // old_stream is shared with other predictors: leave its resource in
      // place and create a fresh resource bound to new_stream.
      auto place = gpu_resources_.at(old_stream)->Place();
      std::unique_ptr<GPUContextResource> resource{
          new GPUContextResource(place, new_stream)};
      gpu_resources_.emplace(new_stream, std::move(resource));
    }
  }

  Decrease(old_stream);
  ref_count_[new_stream]++;
  gpu_resources_.emplace(new_stream, std::move(gpu_resource));
}
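The rebind therefore reduces to a three-way case split on whether new_stream already has a resource and on old_stream's ref count. A self-contained sketch of just that control flow follows; Resource stands in for GPUContextResource and every name here is illustrative, not Paddle's:

#include <map>
#include <memory>

struct Resource {                       // stand-in for GPUContextResource
  void* stream = nullptr;
  void Rebind(void* s) { stream = s; }  // stands in for the ReBind* calls above
};

std::map<void*, std::unique_ptr<Resource>> resources;
std::map<void*, int> ref_count;

void RebindStream(void* old_stream, void* new_stream) {
  if (resources.count(new_stream) == 0) {
    if (ref_count[old_stream] == 1) {
      // Case 1: sole user of old_stream -- move the old resource and rebind it.
      auto r = std::move(resources.at(old_stream));
      r->Rebind(new_stream);
      resources.emplace(new_stream, std::move(r));
    } else {
      // Case 2: old_stream is shared -- build a fresh resource for new_stream.
      auto r = std::make_unique<Resource>();
      r->Rebind(new_stream);
      resources.emplace(new_stream, std::move(r));
    }
  }  // Case 3: new_stream already has a resource -- only the counts change.

  // Transfer one reference from old_stream to new_stream; like Decrease above,
  // drop old_stream's entries once its count reaches zero.
  if (ref_count.count(old_stream) && --ref_count[old_stream] == 0) {
    ref_count.erase(old_stream);
    resources.erase(old_stream);
  }
  ++ref_count[new_stream];
}

The real code routes the decrement through Decrease so the erase logic lives in one place.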

int ResourceManager::RefCount(void* stream) const {
10 changes: 10 additions & 0 deletions paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -981,6 +981,14 @@ if(WITH_GPU AND TENSORRT_FOUND)
    ${INFERENCE_C_EXTRA_DEPS}
    ARGS
    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
  inference_analysis_test(
    trt_rebind_stream_test
    SRCS
    trt_rebind_stream_test.cc
    EXTRA_DEPS
    paddle_inference_shared
    ARGS
    --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
  if(WIN32)
    target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c_shared)
  else()
@@ -1348,6 +1356,8 @@ if(WITH_GPU)
endif()
if(WITH_GPU AND TENSORRT_FOUND)
  set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120)
  set_tests_properties(trt_rebind_stream_test PROPERTIES LABELS
                                                         "RUN_TYPE=EXCLUSIVE")
  if(WITH_MKLDNN)
    set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120)
  endif()
100 changes: 100 additions & 0 deletions paddle/fluid/inference/tests/api/trt_rebind_stream_test.cc
@@ -0,0 +1,100 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <glog/logging.h>
#include <gtest/gtest.h>

#include "gflags/gflags.h"
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"

namespace paddle {
namespace inference {

TEST(ReBindStream_single, use_gpu) {
  std::string model_dir = FLAGS_infer_model + "/mobilenet";
  AnalysisConfig config;
  config.EnableUseGpu(100, 0);
  config.SetModel(model_dir);
  config.EnableTensorRtEngine();

  cudaStream_t stream1, stream2, stream3;
  cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
  cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
  cudaStreamCreateWithFlags(&stream3, cudaStreamNonBlocking);

  // Create the predictor on stream1, then rebind the same predictor to
  // stream2 and stream3 in turn.
  config.SetExecStream(stream1);
  auto predictor = paddle_infer::CreatePredictor(config);
  auto x_t = predictor->GetInputHandle("x");
  x_t->Reshape({1, 3, 224, 224});
  float x_data[3 * 224 * 224] = {0};
  x_t->CopyFromCpu(x_data);
  ASSERT_TRUE(predictor->Run());
  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor.get(), stream2));
  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor.get(), stream3));
}

TEST(ReBindStream_multi, use_gpu) {
  std::string model_dir = FLAGS_infer_model + "/mobilenet";
  AnalysisConfig config1;
  config1.EnableUseGpu(100, 0);
  config1.SetModel(model_dir);
  config1.EnableTensorRtEngine();
  AnalysisConfig config2;
  config2.EnableUseGpu(100, 0);
  config2.EnableTensorRtEngine();
  config2.SetModel(model_dir);

  cudaStream_t stream1, stream2, stream3;
  cudaStreamCreate(&stream1);
  cudaStreamCreate(&stream2);
  cudaStreamCreate(&stream3);

  // Both predictors share stream1, so the resource manager's ref_count for
  // stream1 starts at 2 and the rebinds below exercise the shared-stream path.
  config1.SetExecStream(stream1);
  config2.SetExecStream(stream1);
  auto predictor1 = paddle_infer::CreatePredictor(config1);
  auto predictor2 = paddle_infer::CreatePredictor(config2);

  std::vector<float> x1(3 * 224 * 224, 1.0);
  auto x_t1 = predictor1->GetInputHandle("x");
  x_t1->Reshape({1, 3, 224, 224});
  x_t1->CopyFromCpu(x1.data());
  std::vector<float> x2(3 * 224 * 224, 2.0);
  auto x_t2 = predictor2->GetInputHandle("x");
  x_t2->Reshape({1, 3, 224, 224});
  x_t2->CopyFromCpu(x2.data());

  ASSERT_TRUE(predictor1->Run());
  cudaStreamSynchronize(stream1);
  ASSERT_TRUE(predictor2->Run());
  cudaStreamSynchronize(stream1);

  // Rebind both predictors to stream2: the first rebind moves or creates the
  // resource for stream2, the second only transfers a reference.
  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor1.get(), stream2));
  cudaDeviceSynchronize();
  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor2.get(), stream2));
  cudaDeviceSynchronize();

  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor1.get(), stream3));
  cudaStreamSynchronize(stream3);
  ASSERT_TRUE(paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor2.get(), stream3));
  cudaStreamSynchronize(stream3);
}
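Neither test inspects the model output. As a hedged extension (not part of this PR), a small helper like the one below could copy the first output back to the host so results can be compared across rebinds; it uses only the standard paddle_infer tensor API, and the function name is ours:

#include <functional>
#include <numeric>
#include <vector>

// Illustrative helper, not in the PR: fetch a predictor's first output so the
// values can be compared before and after a stream rebind.
static std::vector<float> FetchFirstOutput(paddle_infer::Predictor* predictor) {
  auto out_t = predictor->GetOutputHandle(predictor->GetOutputNames().front());
  std::vector<int> shape = out_t->shape();
  int numel =
      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
  std::vector<float> out(numel);
  out_t->CopyToCpu(out.data());  // blocking copy of the output back to the host
  return out;
}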

} // namespace inference
} // namespace paddle