Revert "Revert "Add copy from tensor (#34406)" (#35173)" (#35256)
* Revert "Revert "Add copy from tensor (#34406)" (#35173)"

This reverts commit 32c1ec4.

* add template instantiation
shangzhizhou authored Aug 31, 2021
1 parent 00c9aeb commit 6116f9a
Showing 11 changed files with 725 additions and 11 deletions.
4 changes: 4 additions & 0 deletions cmake/configure.cmake
@@ -20,6 +20,10 @@ if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING)

if(WITH_INFERENCE_API_TEST)
add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
endif(WITH_INFERENCE_API_TEST)

if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
5 changes: 3 additions & 2 deletions paddle/fluid/inference/api/CMakeLists.txt
@@ -27,14 +27,15 @@ if(WITH_MKLDNN)
endif()

cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer)
cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)

if(WITH_CRYPTO)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
else()
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
analysis_config zero_copy_tensor trainer_desc_proto custom_operator)
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
endif()

if(WIN32)
88 changes: 80 additions & 8 deletions paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -121,6 +121,8 @@ DataType Tensor::type() const {
return DataType::FLOAT32;
}

PlaceType Tensor::place() const { return place_; }

template <typename T>
void Tensor::CopyFromCpu(const T *data) {
EAGER_GET_TENSOR;
@@ -185,7 +187,8 @@ void Tensor::CopyFromCpu(const T *data) {
}

template <typename T>
void Tensor::CopyToCpu(T *data) {
void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
void *cb_params) const {
EAGER_GET_TENSOR;
auto ele_num = tensor->numel();
auto *t_data = tensor->data<T>();
@@ -222,7 +225,16 @@ void Tensor::CopyToCpu(T *data) {
#ifdef PADDLE_WITH_HIP
hipStreamSynchronize(dev_ctx->stream());
#else
cudaStreamSynchronize(dev_ctx->stream());
// async, return stream
if (nullptr != exec_stream) {
*(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
// async with callback
} else if (cb) {
cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
// sync
} else {
cudaStreamSynchronize(dev_ctx->stream());
}
#endif
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
@@ -261,19 +273,76 @@ void Tensor::CopyToCpu(T *data) {
"The analysis predictor supports CPU, GPU, NPU and XPU now."));
}
}

template <typename T>
void Tensor::CopyToCpu(T *data) const {
CopyToCpuImpl<T>(data, nullptr, nullptr, nullptr);
}

template <typename T>
void Tensor::CopyToCpuAsync(T *data, void *exec_stream) const {
CopyToCpuImpl<T>(data, exec_stream, nullptr, nullptr);
}

template <typename T>
void Tensor::CopyToCpuAsync(T *data, CallbackFunc cb, void *cb_params) const {
CopyToCpuImpl<T>(data, nullptr, cb, cb_params);
}

template PD_INFER_DECL void Tensor::CopyFromCpu<float>(const float *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int64_t>(const int64_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int32_t>(const int32_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<uint8_t>(const uint8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<int8_t>(const int8_t *data);
template PD_INFER_DECL void Tensor::CopyFromCpu<float16>(const float16 *data);

template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data);
template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data);
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data);
template PD_INFER_DECL void Tensor::CopyToCpu<float>(float *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int64_t>(int64_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int32_t>(int32_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<uint8_t>(uint8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<int8_t>(int8_t *data) const;
template PD_INFER_DECL void Tensor::CopyToCpu<float16>(float16 *data) const;

template PD_INFER_DECL void Tensor::CopyToCpuImpl<float>(float *data,
void *exec_stream,
CallbackFunc cb,
void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<int64_t>(
int64_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<int32_t>(
int32_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<uint8_t>(
uint8_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<int8_t>(
int8_t *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuImpl<float16>(
float16 *data, void *exec_stream, CallbackFunc cb, void *cb_params) const;

template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
float *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
int64_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
int32_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
uint8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, void *exec_stream) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, void *exec_stream) const;

template PD_INFER_DECL void Tensor::CopyToCpuAsync<float>(
float *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int64_t>(
int64_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int32_t>(
int32_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<uint8_t>(
uint8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<int8_t>(
int8_t *data, CallbackFunc cb, void *cb_params) const;
template PD_INFER_DECL void Tensor::CopyToCpuAsync<float16>(
float16 *data, CallbackFunc cb, void *cb_params) const;

template PD_INFER_DECL float *Tensor::data<float>(PlaceType *place,
int *size) const;
@@ -285,12 +354,15 @@ template PD_INFER_DECL uint8_t *Tensor::data<uint8_t>(PlaceType *place,
int *size) const;
template PD_INFER_DECL int8_t *Tensor::data<int8_t>(PlaceType *place,
int *size) const;
template PD_INFER_DECL float16 *Tensor::data<float16>(PlaceType *place,
int *size) const;

template PD_INFER_DECL float *Tensor::mutable_data<float>(PlaceType place);
template PD_INFER_DECL int64_t *Tensor::mutable_data<int64_t>(PlaceType place);
template PD_INFER_DECL int32_t *Tensor::mutable_data<int32_t>(PlaceType place);
template PD_INFER_DECL uint8_t *Tensor::mutable_data<uint8_t>(PlaceType place);
template PD_INFER_DECL int8_t *Tensor::mutable_data<int8_t>(PlaceType place);
template PD_INFER_DECL float16 *Tensor::mutable_data<float16>(PlaceType place);

Tensor::Tensor(void *scope) : scope_{scope} {
PADDLE_ENFORCE_NOT_NULL(scope_,
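
For reference, here is a minimal usage sketch of the copy paths this file now implements (not part of the commit): it assumes a paddle_infer::Predictor whose output tensor lives on GPU, that the public header is available as "paddle_inference_api.h", and that CallbackFunc accepts a plain void(void*) host function, matching what cudaLaunchHostFunc expects.

#include <cuda_runtime.h>
#include <functional>
#include <numeric>
#include <vector>
#include "paddle_inference_api.h"  // assumed header name for the public API

namespace {
// Host callback; the signature is assumed to be compatible with CallbackFunc.
void MarkDone(void* flag) { *static_cast<bool*>(flag) = true; }
}  // namespace

void FetchOutput(paddle_infer::Predictor* predictor) {
  auto out = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
  auto shape = out->shape();
  int numel = std::accumulate(shape.begin(), shape.end(), 1,
                              std::multiplies<int>());
  std::vector<float> host(numel);

  // 1) Synchronous copy: blocks until the device-to-host transfer finishes.
  out->CopyToCpu(host.data());

  // 2) Asynchronous copy: the transfer is enqueued and the CUDA stream it
  //    runs on is handed back through exec_stream; the caller synchronizes.
  cudaStream_t stream;
  out->CopyToCpuAsync(host.data(), static_cast<void*>(&stream));
  cudaStreamSynchronize(stream);

  // 3) Asynchronous copy with a host callback enqueued after the transfer.
  bool done = false;
  out->CopyToCpuAsync(host.data(), MarkDone, static_cast<void*>(&done));
}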
190 changes: 190 additions & 0 deletions paddle/fluid/inference/api/paddle_infer_contrib.cc
@@ -0,0 +1,190 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/api/paddle_infer_contrib.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle_infer {
namespace contrib {

using paddle::PaddleDType;

void* TensorUtils::CudaMallocPinnedMemory(size_t size) {
#if defined(PADDLE_WITH_CUDA)
void* ptr = nullptr;
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size));
return ptr;
#else
return nullptr;
#endif
}

void TensorUtils::CudaFreePinnedMemory(void* ptr) {
#if defined(PADDLE_WITH_CUDA)
PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr));
#endif
}

void TensorUtils::CopyTensorImpl(Tensor* p_dst, const Tensor& src,
void* exec_stream, CallbackFunc cb,
void* cb_params) {
Tensor& dst = *p_dst;
dst.Reshape(src.shape());
PADDLE_ENFORCE(
src.place() == PlaceType::kCPU || src.place() == PlaceType::kGPU,
paddle::platform::errors::InvalidArgument(
"CopyTensor only support PlaceType kCPU/kGPU now."));
PADDLE_ENFORCE(
dst.place() == PlaceType::kCPU || dst.place() == PlaceType::kGPU,
paddle::platform::errors::InvalidArgument(
"CopyTensor only support PlaceType kCPU/kGPU now."));
// copy to cpu, gpu => cpu or cpu => cpu
if (dst.place() == PlaceType::kCPU) {
switch (src.type()) {
case PaddleDType::INT32:
src.CopyToCpuImpl(dst.mutable_data<int32_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::INT64:
src.CopyToCpuImpl(dst.mutable_data<int64_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::FLOAT32:
src.CopyToCpuImpl(dst.mutable_data<float>(PlaceType::kCPU), exec_stream,
cb, cb_params);
break;
case PaddleDType::UINT8:
src.CopyToCpuImpl(dst.mutable_data<uint8_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::INT8:
src.CopyToCpuImpl(dst.mutable_data<int8_t>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
case PaddleDType::FLOAT16:
src.CopyToCpuImpl(
dst.mutable_data<paddle::platform::float16>(PlaceType::kCPU),
exec_stream, cb, cb_params);
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, FLOAT16 and "
"FLOAT32 is supported in Tensor. Others not implements"));
}
// gpu => gpu or cpu => gpu
} else {
#if defined(PADDLE_WITH_CUDA)
void* dst_data = nullptr;
void* src_data = nullptr;
size_t data_len = 0;
int data_size = 0;
PlaceType src_place;
switch (src.type()) {
case PaddleDType::INT32:
dst_data =
static_cast<void*>(dst.mutable_data<int32_t>(PlaceType::kGPU));
src_data =
static_cast<void*>(src.data<int32_t>(&src_place, &data_size));
data_len = data_size * sizeof(int32_t);
break;
case PaddleDType::INT64:
dst_data =
static_cast<void*>(dst.mutable_data<int64_t>(PlaceType::kGPU));
src_data =
static_cast<void*>(src.data<int64_t>(&src_place, &data_size));
data_len = data_size * sizeof(int64_t);
break;
case PaddleDType::FLOAT32:
dst_data = static_cast<void*>(dst.mutable_data<float>(PlaceType::kGPU));
src_data = static_cast<void*>(src.data<float>(&src_place, &data_size));
data_len = data_size * sizeof(float);
break;
case PaddleDType::UINT8:
dst_data =
static_cast<void*>(dst.mutable_data<uint8_t>(PlaceType::kGPU));
src_data =
static_cast<void*>(src.data<uint8_t>(&src_place, &data_size));
data_len = data_size * sizeof(uint8_t);
break;
case PaddleDType::INT8:
dst_data =
static_cast<void*>(dst.mutable_data<int8_t>(PlaceType::kGPU));
src_data = static_cast<void*>(src.data<int8_t>(&src_place, &data_size));
data_len = data_size * sizeof(int8_t);
break;
case PaddleDType::FLOAT16:
dst_data = static_cast<void*>(
dst.mutable_data<paddle::platform::float16>(PlaceType::kGPU));
src_data = static_cast<void*>(
src.data<paddle::platform::float16>(&src_place, &data_size));
data_len = data_size * 2;
break;
default:
PADDLE_THROW(paddle::platform::errors::Unimplemented(
"Only INT32, INT64, UINT8, INT8, FLOAT16 and "
"FLOAT32 is supported in Tensor. Others not implements"));
}

paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
paddle::platform::CUDAPlace gpu_place(dst.device_);
auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>(
pool.Get(gpu_place));

if (src.place() == PlaceType::kCPU) {
paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
paddle::platform::CPUPlace(), src_data, data_len,
dev_ctx->stream());
} else {
paddle::memory::Copy(gpu_place, static_cast<void*>(dst_data),
paddle::platform::CUDAPlace(), src_data, data_len,
dev_ctx->stream());
}

if (nullptr != exec_stream) {
*(static_cast<cudaStream_t*>(exec_stream)) = dev_ctx->stream();
} else if (cb) {
cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
} else {
cudaStreamSynchronize(dev_ctx->stream());
}
#else
PADDLE_THROW(paddle::platform::errors::Unavailable(
"Can not copy tensor to GPU CUDA place because paddle is not compiled "
"with CUDA."));
#endif
}
return;
}

void TensorUtils::CopyTensor(Tensor* p_dst, const Tensor& src) {
CopyTensorImpl(p_dst, src, nullptr, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
void* exec_stream) {
CopyTensorImpl(p_dst, src, exec_stream, nullptr, nullptr);
}

void TensorUtils::CopyTensorAsync(Tensor* p_dst, const Tensor& src,
CallbackFunc cb, void* cb_params) {
CopyTensorImpl(p_dst, src, nullptr, cb, cb_params);
}

} // namespace contrib
} // namespace paddle_infer
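
A further illustrative sketch (not from the commit) pairs the pinned-memory helpers defined above with the stream-returning asynchronous copy, so the device-to-host transfer can overlap other host work; the predictor setup and header paths are assumptions, as before.

#include <cuda_runtime.h>
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"

using paddle_infer::contrib::TensorUtils;

void AsyncFetchPinned(paddle_infer::Predictor* predictor) {
  auto out = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
  size_t numel = 1;
  for (int d : out->shape()) numel *= static_cast<size_t>(d);

  // Page-locked host buffer via the new helper (cudaMallocHost underneath),
  // so the asynchronous device-to-host copy does not need a staging buffer.
  float* pinned = static_cast<float*>(
      TensorUtils::CudaMallocPinnedMemory(numel * sizeof(float)));

  cudaStream_t stream;
  out->CopyToCpuAsync(pinned, static_cast<void*>(&stream));
  // ... other host-side work can proceed here ...
  cudaStreamSynchronize(stream);

  TensorUtils::CudaFreePinnedMemory(pinned);
}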
40 changes: 40 additions & 0 deletions paddle/fluid/inference/api/paddle_infer_contrib.h
@@ -0,0 +1,40 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/inference/api/paddle_inference_api.h"

namespace paddle_infer {
namespace contrib {

class TensorUtils {
public:
static void* CudaMallocPinnedMemory(size_t size);
static void CudaFreePinnedMemory(void* mem);

static void CopyTensor(Tensor* p_dst, const Tensor& src);
static void CopyTensorAsync(Tensor* p_dst, const Tensor& src,
void* exec_stream);
static void CopyTensorAsync(Tensor* p_dst, const Tensor& src, CallbackFunc cb,
void* cb_params);

private:
static void CopyTensorImpl(Tensor* p_dst, const Tensor& src,
void* exec_stream, CallbackFunc cb,
void* cb_params);
};

} // namespace contrib
} // namespace paddle_infer
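
Finally, a hedged sketch of the user-facing tensor-to-tensor copy declared here (not from the commit); how the two Tensor handles are obtained, and that at least one of them is a GPU tensor so a CUDA stream is involved, are assumptions.

#include <cuda_runtime.h>
#include "paddle/fluid/inference/api/paddle_infer_contrib.h"

using paddle_infer::contrib::TensorUtils;

void Duplicate(paddle_infer::Tensor* dst, const paddle_infer::Tensor& src) {
  // Synchronous copy: dst is reshaped to src's shape and the data is copied;
  // only kCPU and kGPU placements are supported.
  TensorUtils::CopyTensor(dst, src);

  // Asynchronous variant: the CUDA stream the copy was enqueued on comes back
  // through exec_stream, so the caller decides when to synchronize.
  cudaStream_t stream;
  TensorUtils::CopyTensorAsync(dst, src, static_cast<void*>(&stream));
  cudaStreamSynchronize(stream);
}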
