Add copy from tensor #34406

Merged 29 commits on Aug 26, 2021

Commits

0e568d3  add api (shangzhizhou, Jul 26, 2021)
24f6ae2  Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into… (shangzhizhou, Jul 26, 2021)
80ce681  temp save (shangzhizhou, Aug 12, 2021)
107ce3c  fix conflict (shangzhizhou, Aug 12, 2021)
1fe24a9  revert (shangzhizhou, Aug 12, 2021)
5dbcc80  copytocpu async ok (shangzhizhou, Aug 12, 2021)
37deaab  fix style (shangzhizhou, Aug 13, 2021)
656abda  copy sync ok (shangzhizhou, Aug 17, 2021)
de42de4  Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into… (shangzhizhou, Aug 17, 2021)
1cb2f58  fix compile error (shangzhizhou, Aug 17, 2021)
5967b02  fix compile error (shangzhizhou, Aug 17, 2021)
718429e  api done (shangzhizhou, Aug 19, 2021)
1a20251  update python async api (shangzhizhou, Aug 19, 2021)
94a7798  fix compile (shangzhizhou, Aug 19, 2021)
40334ab  Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into… (shangzhizhou, Aug 19, 2021)
293fa53  remove async python api; add c++ async unittest (shangzhizhou, Aug 20, 2021)
5b18916  remove python async api (shangzhizhou, Aug 20, 2021)
8db5a62  update unittest (shangzhizhou, Aug 23, 2021)
01917d8  update unittest (shangzhizhou, Aug 23, 2021)
c4b0956  add C++ unittest for copytensor (shangzhizhou, Aug 23, 2021)
e566a52  add unittest (shangzhizhou, Aug 24, 2021)
23890de  update namespace utils to class TensorUtils (shangzhizhou, Aug 24, 2021)
743cb76  Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into… (shangzhizhou, Aug 24, 2021)
6334896  add unittest (shangzhizhou, Aug 24, 2021)
be93486  update unittest (shangzhizhou, Aug 24, 2021)
3f4a40d  update unittest (shangzhizhou, Aug 25, 2021)
35222a4  update code style (shangzhizhou, Aug 25, 2021)
ae58c09  update code style (shangzhizhou, Aug 25, 2021)
c1707d0  update unittest (shangzhizhou, Aug 25, 2021)

Files changed

cmake/configure.cmake (4 additions, 0 deletions)
@@ -20,6 +20,10 @@ if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING)

if(WITH_INFERENCE_API_TEST)
add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
endif(WITH_INFERENCE_API_TEST)

if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
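
The new block mirrors the WITH_TESTING guard above: enabling WITH_INFERENCE_API_TEST only adds a PADDLE_WITH_INFERENCE_API_TEST compile definition. A minimal sketch of how such a definition is typically consumed in C++ sources (the guarded hook below is hypothetical, not code from this PR):

// Purely illustrative (not code from this PR): how a compile definition like
// PADDLE_WITH_INFERENCE_API_TEST is typically consumed in C++ sources.
#ifdef PADDLE_WITH_INFERENCE_API_TEST
// Test-only hook, built only when the inference API tests are enabled.
void RunInferenceApiTestOnlyChecks();  // hypothetical name
#endif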
paddle/fluid/inference/api/CMakeLists.txt (3 additions, 2 deletions)
@@ -28,14 +28,15 @@ if(WITH_MKLDNN)
endif()

cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer)
cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)

if(WITH_CRYPTO)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
else()
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
analysis_config zero_copy_tensor trainer_desc_proto custom_operator)
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
endif()

if(WIN32)
paddle/fluid/inference/api/details/zero_copy_tensor.cc (65 additions, 8 deletions)
@@ -121,6 +121,8 @@ DataType Tensor::type() const {
return DataType::FLOAT32;
}

PlaceType Tensor::place() const { return place_; }

template <typename T>
void Tensor::CopyFromCpu(const T *data) {
EAGER_GET_TENSOR;
@@ -185,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) {
}

template <typename T>
void Tensor::CopyToCpu(T *data) {
void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
void *cb_params) const {
EAGER_GET_TENSOR;
auto ele_num = tensor->numel();
auto *t_data = tensor->data<T>();
@@ -222,7 +225,16 @@ void Tensor::CopyToCpu(T *data) {
#ifdef PADDLE_WITH_HIP
hipStreamSynchronize(dev_ctx->stream());
#else
cudaStreamSynchronize(dev_ctx->stream());
// async, return stream
if (nullptr != exec_stream) {
*(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
// async with callback
} else if (cb) {
cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
// sync
} else {
cudaStreamSynchronize(dev_ctx->stream());
}
#endif
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
@@ -261,19 +273,61 @@ void Tensor::CopyToCpu(T *data) {
"The analysis predictor supports CPU, GPU, NPU and XPU now."));
}
}

template <typename T>
void Tensor::CopyToCpu(T *data) const {
CopyToCpuImpl<T>(data, nullptr, nullptr, nullptr);
}

template <typename T>
void Tensor::CopyToCpuAsync(T *data, void *exec_stream) const {
CopyToCpuImpl<T>(data, exec_stream, nullptr, nullptr);
}

template <typename T>
void Tensor::CopyToCpuAsync(T *data, CallbackFunc cb, void *cb_params) const {
CopyToCpuImpl<T>(data, nullptr, cb, cb_params);
}

template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int64_t>(const int64_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);

template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data);
template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data);
template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;

template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
float *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
int64_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
int32_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
uint8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, void *exec_stream) const;

template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
float *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
int64_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
int32_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
uint8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, CallbackFunc cb, void *cb_params) const;

template PD_INFER_DECL float *Tensor::data<float>(PlaceType *place,
int *size) const;
@@ -285,12 +339,15 @@ template PD_INFER_DECL uint8_t *Tensor::data<uint8_t>(PlaceType *place,
int *size) const;
template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
int *size) const;
template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
int *size) const;

template PD_INFER_DECL float *Tensor::mutable_data<float>(PlaceType place);
template PD_INFER_DECL int64_t *Tensor::mutable_data<int64_t>(PlaceType place);
template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);

Tensor::Tensor(void *scope) : scope_{scope} {
PADDLE_ENFORCE_NOT_NULL(scope_,
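
With this change CopyToCpu becomes a thin const wrapper over CopyToCpuImpl, which can either synchronize, hand the caller the CUDA stream the copy was enqueued on, or register a host callback via cudaLaunchHostFunc. A minimal usage sketch of the two async overloads, assuming a configured paddle_infer::Predictor with a FLOAT32 output and a CUDA build (the function and callback names are placeholders, and the include path may differ in your install layout):

// A minimal sketch (not part of this PR's diff): fetch a predictor output
// asynchronously with the new CopyToCpuAsync overloads.
#include <cuda_runtime.h>

#include <vector>

#include "paddle_inference_api.h"  // adjust to your include layout

void FetchOutputAsync(paddle_infer::Predictor* predictor) {
  auto out = predictor->GetOutputHandle(predictor->GetOutputNames().front());
  int numel = 1;
  for (int d : out->shape()) numel *= d;
  std::vector<float> host(numel);  // assumes a FLOAT32 output

  // Variant 1: the copy is enqueued on the device context's stream and that
  // stream is handed back through exec_stream; the caller synchronizes later.
  cudaStream_t stream;
  out->CopyToCpuAsync(host.data(), static_cast<void*>(&stream));
  // ... overlap other host-side work here ...
  cudaStreamSynchronize(stream);

  // Variant 2 (callback form): a host callback fires once the copy has
  // finished, e.g.
  //   out->CopyToCpuAsync(host.data(), &MyHostCallback, &my_params);
}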
paddle/fluid/inference/api/paddle_infer_contrib.cc (new file, 190 additions)
@@ -0,0 +1,190 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle_infer {
namespace contrib {

using paddle::PaddleDType;

void* TensorUtils::CudaMallocPinnedMemory(size_t size) {
#if defined(PADDLE_WITH_CUDA)
void* ptr = nullptr;
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size));
return ptr;
#else
return nullptr;
#endif
}

void TensorUtils::CudaFreePinnedMemory(void* ptr) {
#if defined(PADDLE_WITH_CUDA)
PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr));
#endif
}

void TensorUtils::CopyTensorImpl(Tensor* p_dst, const Tensor& src,
void* exec_stream, CallbackFunc cb,
void* cb_params) {
Tensor& dst = *p_dst;
dst.Reshape(src.shape());
PADDLE_ENFORCE(
src.place() == PlaceType::kCPU || src.place() == PlaceType::kGPU,
paddle::platform::errors::InvalidArgument(
"CopyTensor only support PlaceType kCPU/kGPU now."));
PADDLE_ENFORCE(
dst.place() == PlaceType::kCPU || dst.place() == PlaceType::kGPU,
paddle::platform::errors::InvalidArgument(
"CopyTensor only support PlaceType kCPU/kGPU now."));
// copy to cpu, gpu => cpu or cpu => cpu
if (dst.place() == PlaceType::kCPU) {
switch (src.type()) {
case PaddleDType::INT32:
src.CopyToCpuImpl(dst.mutable_data<int32_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::INT64:
src.CopyToCpuImpl(dst.mutable_data<int64_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::FLOAT32:
src.CopyToCpuImpl(dst.mutable_data<float>(PlaceType::kCPU), exec_stream,
cb, cb_params);
break;
case PaddleDType::UINT8:
src.CopyToCpuImpl(dst.mutable_data<uint8_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::INT8:
src.CopyToCpuImpl(dst.mutable_data<int8_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::FLOAT16:
src.CopyToCpuImpl(
dst.mutable_data<paddle::platform::float16>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, FLOAT16 and "
"FLOAT32 is supported in Tensor. Others not implements"));
}
// gpu => gpu or cpu => gpu
} else {
#if defined(PADDLE_WITH_CUDA)
void* dst_data = nullptr;
void* src_data = nullptr;
size_t data_len = 0;
int data_size = 0;
PlaceType src_place;
switch (src.type()) {
case PaddleDType::INT32:
dst_data =
static_cast<void*>(dst.mutable_data<int32_t>(PlaceType::kGPU));
src_data =
static_cast<void*>(src.data<int32_t>(&src_place, &data_size));
data_len = data_size * sizeof(int32_t);
break;
case PaddleDType::INT64:
dst_data =
static_cast<void*>(dst.mutable_data<int64_t>(PlaceType::kGPU));
src_data =
static_cast<void*>(src.data<int64_t>(&src_place, &data_size));
data_len = data_size * sizeof(int64_t);
break;
case PaddleDType::FLOAT32:
dst_data = static_cast<void*>(dst.mutable_data<float>(PlaceType::kGPU));
src_data = static_cast<void*>(src.data<float>(&src_place, &data_size));
data_len = data_size * sizeof(float);
break;
case PaddleDType::UINT8:
dst_data =
static_cast<void*>(dst.mutable_data<uint8_t>(PlaceType::kGPU));
src_data =
static_cast<void*>(src.data<uint8_t>(&src_place, &data_size));
data_len = data_size * sizeof(uint8_t);
break;
case PaddleDType::INT8:
dst_data =
static_cast<void*>(dst.mutable_data<int8_t>(PlaceType::kGPU));
src_data = static_cast<void*>(src.data<int8_t>(&src_place, &data_size));
data_len = data_size * sizeof(int8_t);
break;
case PaddleDType::FLOAT16:
dst_data = static_cast<void*>(
dst.mutable_data<paddle::platform::float16>(PlaceType::kGPU));
src_data = static_cast<void*>(
src.data<paddle::platform::float16>(&src_place, &data_size));
data_len = data_size * 2;
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, FLOAT16 and "
"FLOAT32 is supported in Tensor. Others not implements"));
}

paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::CUDAPlace gpu_place(dst.device_);
auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>(
pool.Get(gpu_place));

if (src.place() == PlaceType::kCPU) {
paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
paddle::platform::CPUPlace(), src_data, data_len,
dev_ctx->stream());
} else {
paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
paddle::platform::CUDAPlace(), src_data, data_len,
dev_ctx->stream());
}

if (nullptr != exec_stream) {
*(static_cast<cudaStream_t*>(exec_stream)) = dev_ctx->stream();
} else if (cb) {
cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
} else {
cudaStreamSynchronize(dev_ctx->stream());
}
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not copy tensor to GPU CUDA place because paddle is not compiled "
"with CUDA."));
#endif
}
return;
}

void TensorUtils::CopyTensor(Tensor* p_dst, const Tensor& src) {
CopyTensorImpl(p_dst, src, nullptr, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
void* exec_stream) {
CopyTensorImpl(p_dst, src, exec_stream, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
CallbackFunc cb, void* cb_params) {
CopyTensorImpl(p_dst, src, nullptr, cb, cb_params);
}

} // namespace contrib
} // namespace paddle_infer
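
CopyTensorImpl dispatches on the destination place: copies into a kCPU tensor go through the per-dtype CopyToCpuImpl path above (so they inherit the same sync/stream/callback behaviour), while copies into a kGPU tensor use paddle::memory::Copy on the destination device's stream. A minimal sketch of the synchronous entry point, assuming two predictors whose tensors are to be connected (nothing here is prescribed by the PR itself):

// A minimal sketch (not part of this PR's diff): wire one predictor's output
// into another predictor's input with the new TensorUtils helper.
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"

void ForwardOutputToInput(paddle_infer::Predictor* src_pred,
                          paddle_infer::Predictor* dst_pred) {
  auto src = src_pred->GetOutputHandle(src_pred->GetOutputNames().front());
  auto dst = dst_pred->GetInputHandle(dst_pred->GetInputNames().front());
  // Reshapes *dst to src's shape and blocks until the copy has finished.
  paddle_infer::contrib::TensorUtils::CopyTensor(dst.get(), *src);
}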
paddle/fluid/inference/api/paddle_infer_contrib.h (new file, 40 additions)
@@ -0,0 +1,40 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/inference/api/paddle_inference_api.h"

namespace paddle_infer {
namespace contrib {

class TensorUtils {
public:
static void* CudaMallocPinnedMemory(size_t size);
static void CudaFreePinnedMemory(void* mem);

static void CopyTensor(Tensor* p_dst, const Tensor& src);
static void CopyTensorAsync(Tensor* p_dst, const Tensor& src,
void* exec_stream);
static void CopyTensorAsync(Tensor* p_dst, const Tensor& src, CallbackFunc cb,
void* cb_params);

private:
static void CopyTensorImpl(Tensor* p_dst, const Tensor& src,
void* exec_stream, CallbackFunc cb,
void* cb_params);
};

} // namespace contrib
} // namespace paddle_infer
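
The header also exposes CudaMallocPinnedMemory/CudaFreePinnedMemory, which pair naturally with the async copy path: copying into page-locked host memory lets the device-to-host transfer overlap with other host work. A minimal sketch under those assumptions (CUDA build, FLOAT32 output; outside a CUDA build CudaMallocPinnedMemory returns nullptr):

// A minimal sketch (not part of this PR's diff): combine the pinned-memory
// helpers with the async copy path.
#include <cuda_runtime.h>

#include "paddle/fluid/inference/api/paddle_infer_contrib.h"

void AsyncFetchToPinnedHost(paddle_infer::Predictor* predictor) {
  using paddle_infer::contrib::TensorUtils;

  auto out = predictor->GetOutputHandle(predictor->GetOutputNames().front());
  int numel = 1;
  for (int d : out->shape()) numel *= d;

  // Page-locked host buffer, so the device-to-host copy can truly run
  // asynchronously with respect to the host.
  auto* host = static_cast<float*>(
      TensorUtils::CudaMallocPinnedMemory(numel * sizeof(float)));

  cudaStream_t stream;
  out->CopyToCpuAsync(host, static_cast<void*>(&stream));
  // ... other work can overlap with the copy here ...
  cudaStreamSynchronize(stream);

  TensorUtils::CudaFreePinnedMemory(host);
}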