Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support multi-outputs feature for broadcast ops #38329

Merged
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,8 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
}
}
} else { // blockDim,x * NX < num
const int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;
const int kVectorsPerThread = NX / kVectorSize;
constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;
constexpr int kVectorsPerThread = NX / kVectorSize;
int thread_offset = threadIdx.x * kVectorsPerThread;

using VecType = details::VectorType<T, kVectorSize>;
Expand Down Expand Up @@ -441,8 +441,8 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
}
} else {
// Vector type
const int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;
const int kVectorsPerThread = NX / kVectorSize;
constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

constexpr 有性能影响吗 (c++14 支持) ?理论上 const 编译器也会在编译时计算

Copy link
Contributor Author

@JamesLim-sy JamesLim-sy Dec 24, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个修改是从读代码的角度出发的,切换成constexpr的话,可以显式认定是编译期计算

constexpr int kVectorsPerThread = NX / kVectorSize;

int thread_offset = threadIdx.x * kVectorsPerThread;
using VecType = details::VectorType<T, kVectorSize>;
Expand Down
135 changes: 97 additions & 38 deletions paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,12 +193,13 @@ template <typename InT,
typename OutT,
typename Functor,
int Arity,
int NumOuts,
int VecSize,
int Rank,
bool IsBoundary = false>
__device__ void ElementwiseBroadcastKernelImpl(
const paddle::framework::Array<const InT *__restrict__, Arity> &ins,
OutT *out,
paddle::framework::Array<OutT *, NumOuts> outs,
const paddle::framework::Array<bool, Arity> &use_broadcast,
uint32_t numel,
const paddle::framework::Array<kps::details::BroadcastConfig<Rank>, Arity>
Expand All @@ -207,7 +208,7 @@ __device__ void ElementwiseBroadcastKernelImpl(
int block_offset,
Functor func) {
InT args[Arity][VecSize];
OutT result[VecSize];
OutType<OutT, NumOuts> result[VecSize];

#pragma unroll
for (int i = 0; i < Arity; i++) {
Expand All @@ -220,28 +221,29 @@ __device__ void ElementwiseBroadcastKernelImpl(
num,
use_broadcast[i]);
}

const bool kCallElementwiseAny =
constexpr bool kCallElementwiseAny =
paddle::platform::FunctionTraits<Functor>::has_pointer_args;
ElementwisePrimitiveCaller<InT,
OutT,
OutType<OutT, NumOuts>,
VecSize,
Functor,
Arity,
kCallElementwiseAny>()(func, args, result);
kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
out + block_offset, result, num);

ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, NumOuts>()(
outs, result, block_offset, num);
}

template <typename InT,
typename OutT,
typename Functor,
int Arity,
int NumOuts,
int VecSize,
int Rank>
__global__ void ElementwiseBroadcastKernel(
paddle::framework::Array<const InT *__restrict__, Arity> ins,
OutT *out,
paddle::framework::Array<OutT *, NumOuts> outs,
paddle::framework::Array<bool, Arity> use_broadcast,
uint32_t numel,
paddle::framework::Array<kps::details::BroadcastConfig<Rank>, Arity>
Expand All @@ -251,16 +253,18 @@ __global__ void ElementwiseBroadcastKernel(
Functor func) {
int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;

#ifdef PADDLE_WITH_XPU2
for (; block_offset < main_offset; block_offset += stride) {
ElementwiseBroadcastKernelImpl<InT,
OutT,
Functor,
Arity,
NumOuts,
VecSize,
Rank,
false>(ins,
out,
outs,
use_broadcast,
numel,
configs,
Expand All @@ -273,22 +277,23 @@ __global__ void ElementwiseBroadcastKernel(
OutT,
Functor,
Arity,
NumOuts,
VecSize,
Rank,
true>(
ins, out, use_broadcast, numel, configs, tail_tid, block_offset, func);
ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func);
}

#else
if (block_offset < main_offset) {
ElementwiseBroadcastKernelImpl<InT,
OutT,
Functor,
Arity,
NumOuts,
VecSize,
Rank,
false>(ins,
out,
outs,
use_broadcast,
numel,
configs,
Expand All @@ -300,10 +305,11 @@ __global__ void ElementwiseBroadcastKernel(
OutT,
Functor,
Arity,
NumOuts,
VecSize,
Rank,
true>(
ins, out, use_broadcast, numel, configs, tail_tid, block_offset, func);
ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func);
}
#endif
}
Expand All @@ -312,25 +318,30 @@ template <typename InT,
typename OutT,
typename Functor,
int Arity,
int NumOuts,
int VecSize,
int Rank>
void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx,
const std::vector<const DenseTensor *> &ins,
DenseTensor *out,
std::vector<DenseTensor *> *outs,
Functor func,
DimensionsTransform merge_dims) {
int numel = out->numel();
int numel = (*outs)[0]->numel();
const int threads = 256;
int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;

int main_offset = (numel / (VecSize * threads)) * VecSize * threads;
int tail_tid = numel % (VecSize * threads);
auto stream = ctx.stream();
OutT *out_data = out->mutable_data<OutT>();

paddle::framework::Array<kps::details::BroadcastConfig<Rank>, Arity> configs;
paddle::framework::Array<bool, Arity> use_broadcast;
paddle::framework::Array<const InT *__restrict__, Arity> ins_data;
paddle::framework::Array<OutT *, NumOuts> outs_data;

for (int i = 0; i < NumOuts; ++i) {
outs_data[i] = (*outs)[i]->mutable_data<OutT>();
}

for (int i = 0; i < Arity; i++) {
use_broadcast[i] = (ins[i]->numel() != numel);
Expand All @@ -343,6 +354,7 @@ void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx,
merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size);
}
}

#ifdef PADDLE_WITH_XPU2
threads = 128;
blocks = 8;
Expand All @@ -352,9 +364,10 @@ void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx,
OutT,
Functor,
Arity,
NumOuts,
VecSize,
Rank><<<blocks, threads, stream>>>(ins_data,
out_data,
outs_data,
use_broadcast,
numel,
configs,
Expand All @@ -366,10 +379,11 @@ void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx,
OutT,
Functor,
Arity,
NumOuts,
VecSize,
Rank><<<blocks, threads, 0, stream>>>(
ins_data,
out_data,
outs_data,
use_broadcast,
numel,
configs,
Expand All @@ -379,19 +393,24 @@ void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx,
#endif
}

template <typename InT, typename OutT, typename Functor, int Arity, int VecSize>
template <typename InT,
typename OutT,
typename Functor,
int Arity,
int NumOuts,
int VecSize>
void LaunchBroadcastKernelForDifferentVecSize(
const paddle::platform::CUDADeviceContext &ctx,
const std::vector<const DenseTensor *> &ins,
DenseTensor *out,
std::vector<DenseTensor *> *outs,
int axis,
Functor func) {
const auto merge_dims = DimensionsTransform(ins, out->dims(), axis);
const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis);

#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \
case rank: { \
LaunchKernel<InT, OutT, Functor, Arity, VecSize, rank>( \
ctx, ins, out, func, merge_dims); \
#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \
case rank: { \
LaunchKernel<InT, OutT, Functor, Arity, NumOuts, VecSize, rank>( \
ctx, ins, outs, func, merge_dims); \
} break;

switch (merge_dims.dim_size) {
Expand All @@ -414,7 +433,11 @@ void LaunchBroadcastKernelForDifferentVecSize(
#undef CALL_BROADCAST_FOR_DIM_SIZE
}

template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
template <ElementwiseType ET,
typename InT,
typename OutT,
typename Functor,
int NumOuts = 1>
void LaunchBroadcastElementwiseCudaKernel(
const paddle::platform::CUDADeviceContext &ctx,
const std::vector<const DenseTensor *> &ins,
Expand All @@ -438,32 +461,68 @@ void LaunchBroadcastElementwiseCudaKernel(
"Currently only broadcast of binary is supported and "
"verified, but received %d.",
kArity));

PADDLE_ENFORCE_EQ(
outs->size(),
NumOuts,
paddle::platform::errors::InvalidArgument(
"Number of outputs shall equal to number of functions, "
"but number of outputs is %d, number of functions is %d.",
outs->size(),
NumOuts));
int in_vec_size = 4;
DenseTensor *out = (*outs)[0];
int out_vec_size = 4;
if (NumOuts > 1) {
for (int i = 0; i < NumOuts; ++i) {
PADDLE_ENFORCE_EQ(
(*outs)[i]->dims(),
(*outs)[0]->dims(),
paddle::platform::errors::InvalidArgument(
"The shape of each output tensor shall be identical yet, but "
"%dth output tensor`s shape is not.",
i));
out_vec_size = std::min(
paddle::platform::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()),
out_vec_size);
}
} else {
out_vec_size =
paddle::platform::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>());
}

for (auto *in : ins) {
auto temp_size = paddle::platform::GetVectorizedSize<InT>(in->data<InT>());
in_vec_size = in->dims() == out->dims() ? std::min(temp_size, in_vec_size)
: in_vec_size;
in_vec_size = in->dims() == (*outs)[0]->dims()
? std::min(temp_size, in_vec_size)
: in_vec_size;
}
int out_vec_size =
paddle::platform::GetVectorizedSize<OutT>(out->data<OutT>());
int vec_size = std::min(out_vec_size, in_vec_size);

switch (vec_size) {
case 4: {
LaunchBroadcastKernelForDifferentVecSize<InT, OutT, Functor, kArity, 4>(
ctx, ins, out, axis, func);
LaunchBroadcastKernelForDifferentVecSize<InT,
OutT,
Functor,
kArity,
NumOuts,
4>(ctx, ins, outs, axis, func);
break;
}
case 2: {
LaunchBroadcastKernelForDifferentVecSize<InT, OutT, Functor, kArity, 2>(
ctx, ins, out, axis, func);
LaunchBroadcastKernelForDifferentVecSize<InT,
OutT,
Functor,
kArity,
NumOuts,
2>(ctx, ins, outs, axis, func);
break;
}
case 1: {
LaunchBroadcastKernelForDifferentVecSize<InT, OutT, Functor, kArity, 1>(
ctx, ins, out, axis, func);
LaunchBroadcastKernelForDifferentVecSize<InT,
OutT,
Functor,
kArity,
NumOuts,
1>(ctx, ins, outs, axis, func);
break;
}
default: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ namespace pten {
namespace kps = paddle::operators::kernel_primitives;
enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 };

/* Packs a scalar type T (float, int, etc.) into Array<T, NumOuts> to support
   the multiple-output feature of the elementwise system.
   - Num == 1: OutType<T, 1> is plain T (single-output fast path, no wrapper).
   - Num >  1: OutType<T, Num> is paddle::framework::Array<T, Num>, one slot
     per output tensor.
   Note: std::conditional_t is itself the `_t` alias, so no leading `typename`
   is needed (it was redundant before). */
template <typename T, int Num>
using OutType =
    std::conditional_t<Num == 1, T, paddle::framework::Array<T, Num>>;

template <typename InT,
typename OutT,
int VecSize,
Expand Down Expand Up @@ -76,4 +82,39 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 3, false> {
}
};

/* Writes the per-thread multi-output results back to global memory.
   `src` holds VecSize packed elements, each an Array<OutT, NumOuts> carrying
   one value per output tensor; it is transposed into one contiguous row per
   output so each output can be flushed with a single vectorized WriteData
   call. IsBoundary guards the tail block where fewer than VecSize elements
   remain (`num` is the valid element count). */
template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
struct ElementwiseWriteDataCaller {
  __device__ __forceinline__ void operator()(
      paddle::framework::Array<OutT *, NumOuts> outs,
      OutType<OutT, NumOuts> src[VecSize],
      int block_offset,
      int num) {
    // Stage a (NumOuts x VecSize) transpose of src in registers.
    OutT staged[NumOuts][VecSize];
#pragma unroll
    for (int out_idx = 0; out_idx < NumOuts; ++out_idx) {
#pragma unroll
      for (int vec_idx = 0; vec_idx < VecSize; ++vec_idx) {
        staged[out_idx][vec_idx] = (src[vec_idx])[out_idx];
      }
    }
    // Flush one contiguous row per output tensor.
#pragma unroll
    for (int out_idx = 0; out_idx < NumOuts; ++out_idx) {
      kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
          outs[out_idx] + block_offset, staged[out_idx], num);
    }
  }
};

/* Single-output specialization (NumOuts == 1): OutType<OutT, 1> collapses to
   plain OutT, so no transpose staging is needed — the result vector is
   forwarded to WriteData directly. */
template <typename OutT, int VecSize, bool IsBoundary>
struct ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, 1> {
  __device__ __forceinline__ void operator()(
      paddle::framework::Array<OutT *, 1> outs,
      OutT src[VecSize],
      int block_offset,
      int num) {
    // Destination of this thread block's segment in the sole output tensor.
    OutT *dst = outs[0] + block_offset;
    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(dst, src, num);
  }
};

} // namespace pten