From fda92a765dbd225d39ef10eb1137bd7e6d8fe3a5 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Thu, 14 Sep 2023 17:06:21 +0800 Subject: [PATCH] update --- paddle/phi/kernels/funcs/broadcast_function.h | 374 +++++++++++------- 1 file changed, 224 insertions(+), 150 deletions(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 95789f555e6cfa..9ac1d819096fe6 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -29,74 +29,86 @@ namespace funcs { #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) -enum BroadcastLoadType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; - -template -struct UseBroadcast { - template - static HOSTDEVICE void Apply( - const std::vector &ins_tensor, - const ArgsT &args, - int64_t numel, - Array1 *ins_data, - Array2 *use_broadcast, - int *broadcast_num, - bool *all_elementwise) { - (*ins_data)[Index] = (const _ptr_ char *)(ins_tensor[Index]->data()); - bool is_same_dim = ins_tensor[Index]->numel() == numel; - if (is_same_dim) { - (*use_broadcast)[Index] = false; - } else { - (*use_broadcast)[Index] = true; - (*broadcast_num)++; - } - *all_elementwise &= is_same_dim; - } -}; +enum BroadcastType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; -template -struct LoaderTypeClassifier { - public: +template +struct BroadcastTypeClassifier { int64_t numel{0}; - int vec_size{4}; - int broadcast_num{0}; - bool all_elementwise{true}; - phi::Array use_broadcast; + int broadcast_num{0}; // Not used for XPU + bool all_elementwise{true}; // Not used for XPU + phi::Array use_broadcast; // Not used for XPU + phi::Array configs; phi::Array ins_data; + phi::Array<_ptr_ OutT *, NumOuts> outs_data; + + BroadcastTypeClassifier() {} + BroadcastTypeClassifier(const std::vector &ins, + std::vector *outs, + int axis) { + numel = (*outs)[0]->numel(); + +#ifndef PADDLE_WITH_XPU_KP + for (size_t i = 0; i < ins.size(); ++i) { + bool is_same_dim = ins[i]->numel() == numel; + if (is_same_dim) { + use_broadcast[i] = false; + } else { + use_broadcast[i] = true; + broadcast_num++; + } + all_elementwise &= is_same_dim; + } +#endif + + InitBroadcastConfigs(ins, outs, axis); - LoaderTypeClassifier() {} - LoaderTypeClassifier(const std::vector &ins, - std::vector *outs) { using Traits = phi::funcs::FunctionTraits; using ArgsT = typename Traits::ArgsTuple; ArgsT arg; - uint64_t out_addr = reinterpret_cast((*outs)[0]->data()); - - UnrollerWithoutVecSize::step(ins, arg, &vec_size); - - for (auto i = 1; i < outs->size(); ++i) { - PADDLE_ENFORCE_EQ( - (*outs)[i]->dims(), - (*outs)[0]->dims(), - phi::errors::InvalidArgument( - "The shape of each output tensor shall be identical yet, but " - "%d-th output tensor`s shape is not.", - i)); - out_addr = - (out_addr | reinterpret_cast((*outs)[i]->data())); + UnrollerWithoutVecSize::step(ins, arg, &ins_data); + for (int i = 0; i < NumOuts; ++i) { + outs_data[i] = (*outs)[i]->data(); } + } - vec_size = std::min( - vec_size, - phi::GetVectorizedSize(reinterpret_cast(out_addr))); - numel = (*outs)[0]->numel(); - UnrollerWithoutVecSize::step(ins, - arg, - numel, - &ins_data, - &use_broadcast, - &broadcast_num, - &all_elementwise); + void InitBroadcastConfigs(const std::vector &ins, + std::vector *outs, + int axis) { +#ifdef PADDLE_WITH_XPU_KP + const auto dims_simplifier = + BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis); + if (VLOG_IS_ON(6)) { + DimsSimplifiedLogger::Log( + ins, outs, dims_simplifier, "BroadcastKernel"); + } + configs[0] = kps::details::BroadcastConfig(dims_simplifier.out_dims, + dims_simplifier.in_dims[0], + dims_simplifier.in_dims[1], + dims_simplifier.rank); + configs[1] = kps::details::BroadcastConfig(dims_simplifier.out_dims, + dims_simplifier.in_dims[1], + dims_simplifier.in_dims[0], + dims_simplifier.rank); +#else + if (!all_elementwise) { + const auto dims_simplifier = + BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis); + if (VLOG_IS_ON(6)) { + DimsSimplifiedLogger::Log( + ins, outs, dims_simplifier, "BroadcastKernel"); + } + for (int i = 0; i < Arity; ++i) { + // if data shape is[m, n], then you should set data_dim = {n, m} + // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} + // if (ins[i]->numel() != (*outs)[0]->numel()) { + if (ins[i]->numel()) { + configs[i] = kps::details::BroadcastConfig(dims_simplifier.out_dims, + dims_simplifier.in_dims[i], + dims_simplifier.rank); + } + } + } +#endif } }; @@ -425,17 +437,10 @@ __global__ void VectorizedBroadcastKernel( template void LaunchBroadcastKernel( const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - Functor func, - const phi::Array &configs, - const LoaderTypeClassifier &loader_classifier) { + const BroadcastTypeClassifier &classifier, + Functor func) { #ifdef PADDLE_WITH_XPU_KP - phi::Array<_ptr_ OutT *, NumOuts> outs_data; - for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (_ptr_ OutT *)(ctx.Alloc((*outs)[i])); - } - int numel = (*outs)[0]->numel(); + int numel = classifier.numel; const int threads = 64; const int blocks = 8; int read_lens = configs[0].buf_len; @@ -444,21 +449,17 @@ void LaunchBroadcastKernel( int tail_tid = numel % (read_lens * threads); VectorizedBroadcastKernel - <<>>(loader_classifier.ins_data, - outs_data, - loader_classifier.use_broadcast, + <<>>(classifier.ins_data, + classifier.outs_data, + classifier.use_broadcast, numel, - configs, + classifier.configs, main_offset, tail_tid, read_lens, func); #else - phi::Array<_ptr_ OutT *, NumOuts> outs_data; - for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (*outs)[i]->data(); - } - const auto &numel = loader_classifier.numel; + const auto &numel = classifier.numel; auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); auto stream = ctx.stream(); @@ -467,41 +468,41 @@ void LaunchBroadcastKernel( int main_offset = (numel / (VecSize * threads)) * VecSize * threads; int tail_tid = numel % (VecSize * threads); - if (loader_classifier.all_elementwise) { + if (classifier.all_elementwise) { VectorizedBroadcastKernel - <<>>(loader_classifier.ins_data, - outs_data, - loader_classifier.use_broadcast, + <<>>(classifier.ins_data, + classifier.outs_data, + classifier.use_broadcast, numel, - configs, + classifier.configs, main_offset, tail_tid, VecSize, func); - } else if (loader_classifier.broadcast_num > (Arity >> 1)) { - constexpr BroadcastLoadType type_ = (Arity > 1) ? kBroadcast : kMixed; + } else if (classifier.broadcast_num > (Arity >> 1)) { + constexpr BroadcastType type_ = (Arity > 1) ? kBroadcast : kMixed; VectorizedBroadcastKernel - <<>>(loader_classifier.ins_data, - outs_data, - loader_classifier.use_broadcast, + <<>>(classifier.ins_data, + classifier.outs_data, + classifier.use_broadcast, numel, - configs, + classifier.configs, main_offset, tail_tid, VecSize, func); } else { VectorizedBroadcastKernel - <<>>(loader_classifier.ins_data, - outs_data, - loader_classifier.use_broadcast, + <<>>(classifier.ins_data, + classifier.outs_data, + classifier.use_broadcast, numel, - configs, + classifier.configs, main_offset, tail_tid, VecSize, @@ -635,9 +636,13 @@ struct LaunchBroadcastKernelWithInt64IndexHelper *outs, int axis, Functor functor) { + using Traits = phi::funcs::FunctionTraits; + using ArgsT = typename Traits::ArgsTuple; + ArgsT arg; phi::Array::kValue> ins_ptrs; - UnrollerWithoutVecSize::step(ins, &ins_ptrs); + UnrollerWithoutVecSize::step(ins, arg, &ins_ptrs); + auto *out_tensor = (*outs)[0]; auto *out_ptr = ctx.Alloc(out_tensor); @@ -818,81 +823,128 @@ struct LaunchBroadcastKernelWithInt64IndexHelper -void BroadcastKernelForDifferentVecSize( - const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - phi::Array configs; -#ifdef PADDLE_WITH_XPU_KP - PADDLE_ENFORCE_EQ( - ins.size(), - 2, - phi::errors::InvalidArgument( - "XPU only support inputs is 2, but received %d", ins.size())); - - auto loader_classifier = LoaderTypeClassifier(); - const auto dims_simplifier = - BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis); - if (VLOG_IS_ON(6)) { - DimsSimplifiedLogger::Log( - ins, outs, dims_simplifier, "XPU Broadcast"); +template +typename std::enable_if::value, void>::type +BroadcastKernelForDifferentVecSize(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { +#ifndef PADDLE_WITH_XPU_KP + constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && Arity <= 3); + bool use_int64_index_kernel = + kEnabledInt64IndexKernel && + (*outs)[0]->numel() >= std::numeric_limits::max(); + if (use_int64_index_kernel) { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + return; } - configs[0] = kps::details::BroadcastConfig(dims_simplifier.out_dims, - dims_simplifier.in_dims[0], - dims_simplifier.in_dims[1], - dims_simplifier.rank); - configs[1] = kps::details::BroadcastConfig(dims_simplifier.out_dims, - dims_simplifier.in_dims[1], - dims_simplifier.in_dims[0], - dims_simplifier.rank); +#endif + + auto classifier = + BroadcastTypeClassifier(ins, outs, axis); + LaunchBroadcastKernel( + ctx, classifier, func); +} + +template +typename std::enable_if::value, void>::type +BroadcastKernelForDifferentVecSize(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { +#ifdef PADDLE_WITH_XPU_KP auto type = kps::details::OptType::CanNotOptimize; - bool is_optimize = configs[0].cmp_type != type; + bool is_optimize = classifier.configs[0].cmp_type != type; int vec_size = is_optimize ? VecSizeL : VecSizeM; #else - auto loader_classifier = - LoaderTypeClassifier(ins, outs); - if (!loader_classifier.all_elementwise) { - const auto dims_simplifier = - BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis); + // Calculate the max vec_size for all ins and outs. + int vec_size = GetVectorizedSizeForTensors(ins, *outs); +#endif - if (VLOG_IS_ON(6)) { - DimsSimplifiedLogger::Log( - ins, outs, dims_simplifier, "GPU Broadcast"); - } - for (int i = 0; i < kArity; ++i) { - // if data shape is[m, n], then you should set data_dim = {n, m} - // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} - // if (ins[i]->numel() != (*outs)[0]->numel()) { - if (ins[i]->numel()) { - configs[i] = kps::details::BroadcastConfig(dims_simplifier.out_dims, - dims_simplifier.in_dims[i], - dims_simplifier.rank); +#ifndef PADDLE_WITH_XPU_KP + constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && Arity <= 3); + bool use_int64_index_kernel = + kEnabledInt64IndexKernel && + (*outs)[0]->numel() >= std::numeric_limits::max(); + if (use_int64_index_kernel) { + switch (vec_size) { + case VecSizeL: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + case VecSizeM: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + case VecSizeS: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d!", vec_size)); + break; } } + return; } #endif - switch (loader_classifier.vec_size) { + + auto classifier = + BroadcastTypeClassifier(ins, outs, axis); + switch (vec_size) { case VecSizeL: { - LaunchBroadcastKernel( - ctx, ins, outs, func, configs, loader_classifier); + LaunchBroadcastKernel( + ctx, classifier, func); break; } case VecSizeM: { - LaunchBroadcastKernel( - ctx, ins, outs, func, configs, loader_classifier); + LaunchBroadcastKernel( + ctx, classifier, func); break; } case VecSizeS: { - LaunchBroadcastKernel( - ctx, ins, outs, func, configs, loader_classifier); + LaunchBroadcastKernel( + ctx, classifier, func); break; } default: { PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported vectorized size: %d!", loader_classifier.vec_size)); + "Unsupported vectorized size: %d!", vec_size)); break; } } @@ -917,7 +969,7 @@ void BroadcastKernelApply(const KPDevice &ctx, (*outs)[0]->numel() >= compute_size && (!loader_classifier.all_elementwise); - if (use_int64_index_kernel) { // use_int64_index_kernel) { + if (use_int64_index_kernel) { // use_int64_index_kernel const auto dims_simplifier = BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis); if (VLOG_IS_ON(6)) { @@ -1109,6 +1161,15 @@ void BroadcastKernel(const KPDevice &ctx, // maximum rank of all inputs. using Traits = phi::funcs::FunctionTraits; const int kArity = Traits::arity; + +#ifdef PADDLE_WITH_XPU_KP + PADDLE_ENFORCE_EQ( + ins.size(), + 2, + phi::errors::InvalidArgument( + "XPU only support inputs is 2, but received %d", ins.size())); +#endif + PADDLE_ENFORCE_EQ( ins.size(), kArity, @@ -1129,6 +1190,19 @@ void BroadcastKernel(const KPDevice &ctx, outs->size(), NumOuts)); + for (auto i = 0; i < outs->size(); ++i) { + if (i > 0) { + PADDLE_ENFORCE_EQ( + (*outs)[i]->dims(), + (*outs)[0]->dims(), + phi::errors::InvalidArgument( + "The shape of each output tensor shall be identical yet, but " + "%d-th output tensor`s shape is not.", + i)); + } + ctx.template Alloc((*outs)[i]); + } + int max_rank = 0; int min_rank = phi::DDim::kMaxRank; for (auto *in : ins) {