[PHI decoupling] move softmax from fluid to phi and remove cpu_vec.h in fluid (#48970)
huangjiyi authored Dec 15, 2022
1 parent 4672ea8 commit 344b99e
Showing 65 changed files with 371 additions and 1,074 deletions.
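
Every diff below follows the same mechanical pattern: call sites drop paddle/fluid/platform/cpu_info.h and the paddle::platform namespace in favor of paddle/phi/backends/cpu/cpu_info.h and phi::backends::cpu. A minimal before/after sketch of one such call site (the wrapper function is hypothetical; only the headers and namespaces come from the diffs):

    // Before this commit: ISA query through the fluid platform layer.
    // #include "paddle/fluid/platform/cpu_info.h"
    // bool SupportsAvx512Bf16() {
    //   return paddle::platform::MayIUse(paddle::platform::cpu_isa_t::avx512_bf16);
    // }

    // After this commit: the identical query through the phi backend layer.
    #include "paddle/phi/backends/cpu/cpu_info.h"

    bool SupportsAvx512Bf16() {  // hypothetical wrapper, not part of the commit
      return phi::backends::cpu::MayIUse(
          phi::backends::cpu::cpu_isa_t::avx512_bf16);
    }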
7 changes: 4 additions & 3 deletions paddle/fluid/inference/api/analysis_config.cc
@@ -19,9 +19,9 @@
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/inference/utils/table_printer.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/utils/string/split.h"
 
 #ifdef PADDLE_WITH_TENSORRT
@@ -624,10 +624,11 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
 
 void AnalysisConfig::EnableMkldnnBfloat16() {
 #ifdef PADDLE_WITH_MKLDNN
-  if (platform::MayIUse(platform::cpu_isa_t::avx512_core)) {
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core)) {
     use_mkldnn_bfloat16_ = true;
     LOG(INFO) << "Hardware support for BFLOAT16"
-              << (platform::MayIUse(platform::cpu_isa_t::avx512_bf16)
+              << (phi::backends::cpu::MayIUse(
+                      phi::backends::cpu::cpu_isa_t::avx512_bf16)
                       ? " is enabled"
                       : " is disabled. Simulation will be used");
   } else {
4 changes: 2 additions & 2 deletions paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -29,7 +29,7 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 
 DEFINE_string(dirname, "", "dirname to tests.");

@@ -327,7 +327,7 @@ TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
   config.EnableUseGpu(100, 0);
   config.EnableMkldnnBfloat16();
 #ifdef PADDLE_WITH_MKLDNN
-  if (platform::MayIUse(platform::cpu_isa_t::avx512_core))
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
     ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
   else
     ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
2 changes: 1 addition & 1 deletion paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
@@ -27,7 +27,7 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 
 DEFINE_string(dirname, "", "dirname to tests.");

@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 
 DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN");

@@ -47,7 +47,7 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInputs(&input_slots_all);
   if (FLAGS_enable_mkldnn && FLAGS_enable_bf16 &&
-      platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) {
+      phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_bf16)) {
     b_cfg.EnableMkldnnBfloat16();
   } else {
     FLAGS_enable_bf16 = false;
2 changes: 1 addition & 1 deletion paddle/fluid/memory/allocation/buddy_allocator.h
@@ -27,9 +27,9 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/allocation/memory_block.h"
 #include "paddle/fluid/memory/allocation/system_allocator.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 
 namespace paddle {
 namespace memory {
12 changes: 6 additions & 6 deletions paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -78,8 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
   std::call_once(init_flag, []() {
     a = new detail::BuddyAllocator(
         std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(),
-        platform::CpuMaxChunkSize());
+        phi::backends::cpu::CpuMinChunkSize(),
+        phi::backends::cpu::CpuMaxChunkSize());
   });
 
   return a;
@@ -290,8 +290,8 @@ BuddyAllocator *GetNPUPinnedBuddyAllocator() {
   std::call_once(init_flag, []() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::NPUPinnedAllocator),
-                            platform::NPUPinnedMinChunkSize(),
-                            platform::NPUPinnedMaxChunkSize());
+                            phi::backends::cpu::NPUPinnedMinChunkSize(),
+                            phi::backends::cpu::NPUPinnedMaxChunkSize());
   });
 
   return ba;
@@ -562,8 +562,8 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
   std::call_once(init_flag, []() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::CUDAPinnedAllocator),
-                            platform::CUDAPinnedMinChunkSize(),
-                            platform::CUDAPinnedMaxChunkSize());
+                            phi::backends::cpu::CUDAPinnedMinChunkSize(),
+                            phi::backends::cpu::CUDAPinnedMaxChunkSize());
   });
 
   return ba;
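
All three Get*BuddyAllocator functions above share the std::call_once lazy-singleton idiom, which this diff only retouches. A standalone sketch of the idiom with stand-in names (Allocator and the chunk sizes are illustrative, not Paddle's):

    #include <cstddef>
    #include <mutex>

    struct Allocator {  // stand-in for detail::BuddyAllocator
      Allocator(size_t min_chunk, size_t max_chunk)
          : min_chunk_(min_chunk), max_chunk_(max_chunk) {}
      size_t min_chunk_, max_chunk_;
    };

    Allocator* GetSingletonAllocator() {
      static std::once_flag init_flag;
      static Allocator* a = nullptr;
      // The lambda runs exactly once, even under concurrent first calls;
      // every other caller blocks until initialization completes.
      std::call_once(init_flag, []() {
        a = new Allocator(/*min_chunk=*/1 << 12, /*max_chunk=*/1 << 20);
      });
      return a;
    }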
6 changes: 3 additions & 3 deletions paddle/fluid/memory/allocation/system_allocator.cc
@@ -28,10 +28,10 @@ limitations under the License. */
 #endif
 #include "gflags/gflags.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_info.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #ifdef PADDLE_WITH_MLU
 #include "paddle/fluid/platform/device/mlu/mlu_info.h"
 #endif
@@ -206,7 +206,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
   // of host pinned allocation. Allocates too much would reduce
   // the amount of memory available to the underlying system for paging.
   size_t usable =
-      paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
+      phi::backends::cpu::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
 
   if (size > usable) {
     LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
@@ -362,7 +362,7 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
   if (size <= 0) return nullptr;
 
   size_t usable =
-      paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
+      phi::backends::cpu::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
 
   if (size > usable) {
     LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
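
Both Alloc paths above guard pinned allocations with the same cap-and-check: pinned (page-locked) pages can never be swapped out, so the allocator refuses requests that would push the pinned total past a configured maximum. A reduced sketch of the guard (names and the 256 MB cap are illustrative, not Paddle's defaults):

    #include <cstddef>
    #include <cstdio>

    static size_t pinned_alloc_size = 0;                  // bytes currently pinned
    static const size_t kPinnedMaxAllocSize = 256 << 20;  // assumed cap

    bool TryReservePinned(size_t size) {
      // Over-allocating pinned memory starves the OS of pageable memory.
      size_t usable = kPinnedMaxAllocSize - pinned_alloc_size;
      if (size > usable) {
        std::fprintf(stderr, "Cannot malloc %f MB pinned memory\n",
                     size / 1024.0 / 1024.0);
        return false;
      }
      pinned_alloc_size += size;
      return true;
    }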
2 changes: 1 addition & 1 deletion paddle/fluid/memory/pinned_memory_test.cu
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/memory_block.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 
 // This unit test is an example comparing the performance between using pinned
 // memory and not. In general, using pinned memory will be faster.
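
The comment above is the point of the test: page-locked host buffers let the GPU DMA directly, while pageable malloc buffers force the driver through an extra staging copy. A free-standing CUDA sketch of the comparison the test performs (not the test itself; buffer size and output format are arbitrary):

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    int main() {
      const size_t n = 64 << 20;  // 64 MB payload
      void *pageable = std::malloc(n), *pinned = nullptr, *device = nullptr;
      cudaMallocHost(&pinned, n);  // page-locked host memory
      cudaMalloc(&device, n);

      cudaEvent_t start, stop;
      cudaEventCreate(&start);
      cudaEventCreate(&stop);

      for (void* src : {pageable, pinned}) {
        cudaEventRecord(start);
        cudaMemcpy(device, src, n, cudaMemcpyHostToDevice);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        std::printf("%s H2D copy: %.2f ms\n",
                    src == pinned ? "pinned  " : "pageable", ms);
      }

      cudaFree(device);
      cudaFreeHost(pinned);
      std::free(pageable);
      return 0;
    }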
19 changes: 10 additions & 9 deletions paddle/fluid/operators/attention_lstm_op.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <string>
 
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
@@ -315,10 +315,10 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
   if (bias) {
-    phi::funcs::vec_add_bias<T, platform::avx>(n, *bias, x, y);
-    phi::funcs::vec_relu<T, platform::avx>(n, y, y);
+    phi::funcs::vec_add_bias<T, phi::backends::cpu::avx>(n, *bias, x, y);
+    phi::funcs::vec_relu<T, phi::backends::cpu::avx>(n, y, y);
   } else {
-    phi::funcs::vec_relu<T, platform::avx>(n, x, y);
+    phi::funcs::vec_relu<T, phi::backends::cpu::avx>(n, x, y);
   }
 }

@@ -329,8 +329,9 @@ inline void vec_softmax(const int n, const T* x, T* y) {
   for (int i = 1; i < n; ++i) {
     scalar = scalar < x[i] ? x[i] : scalar;
   }
-  phi::funcs::vec_add_bias<T, platform::avx>(n, -scalar, x, y);  // sub
-  phi::funcs::vec_exp<T>(n, y, y);                               // exp
+  phi::funcs::vec_add_bias<T, phi::backends::cpu::avx>(
+      n, -scalar, x, y);            // sub
+  phi::funcs::vec_exp<T>(n, y, y);  // exp
   // sum
   scalar = T(0);
   for (int i = 0; i < n; ++i) {
@@ -393,13 +394,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
     auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
     auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
-    if (platform::MayIUse(platform::avx)) {
-      phi::funcs::VecActivations<T, platform::avx> act_functor;
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
+      phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;
       act_gate = act_functor(act_gate_str);
       act_cell = act_functor(act_cell_str);
       act_cand = act_functor(act_cand_str);
     } else {
-      phi::funcs::VecActivations<T, platform::isa_any> act_functor;
+      phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor;
       act_gate = act_functor(act_gate_str);
       act_cell = act_functor(act_cell_str);
       act_cand = act_functor(act_cand_str);
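
vec_softmax above is the standard max-subtraction softmax: shifting every input by max(x) before exponentiating cannot change the result (the common factor cancels in the normalization) but keeps exp from overflowing. A scalar sketch of the same four steps, without the vectorized phi::funcs primitives:

    #include <algorithm>
    #include <cmath>

    // Numerically stable softmax: softmax(x) == softmax(x - max(x)), but the
    // shifted form never feeds exp() a large positive argument.
    void scalar_softmax(int n, const float* x, float* y) {
      float max_val = *std::max_element(x, x + n);  // max
      float sum = 0.0f;
      for (int i = 0; i < n; ++i) {
        y[i] = std::exp(x[i] - max_val);  // sub + exp
        sum += y[i];                      // sum
      }
      for (int i = 0; i < n; ++i) y[i] /= sum;  // scale
    }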
@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
-#include "paddle/fluid/operators/math/softmax_impl.h"
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/cross_entropy.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"
 
 namespace paddle {
 namespace operators {
@@ -129,15 +131,15 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     softmax_2d.ShareDataWith(*softmax).Resize({N, D});
     loss_2d.ShareDataWith(*loss).Resize({N, 1});
 
-    auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
-    auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
+    auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
+    auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);
 
     // step 1, obtain logit_max
     phi::DenseTensor logits_max;
     logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
     void* logits_max_buff = logits_max.mutable_data<T>(place);
 
-    auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
+    auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
     Eigen::DSizes<int, 1> along_axis(1);
     eigen_logits_max.device(*dev_ctx.eigen_device()) =
         eigen_logits.maximum(along_axis);
@@ -158,7 +160,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     eigen_softmax.device(*dev_ctx.eigen_device()) =
         (eigen_logits -
          eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
-            .unaryExpr(math::ValueClip<T>());
+            .unaryExpr(phi::funcs::ValueClip<T>());
 
     // step 3, obtain predict target
     phi::DenseTensor predicted_logits;
@@ -217,7 +219,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
     void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
 
-    auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
+    auto eigen_sum_exp_logits =
+        phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
     eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
         eigen_softmax.sum(along_axis);

@@ -231,8 +234,9 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
         comm->comm(),
         stream));
 
-    auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
-    auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
+    auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
+    auto eigen_predicted_logits =
+        phi::funcs::EigenMatrix<T>::From(predicted_logits);
 
     eigen_loss.device(*dev_ctx.eigen_device()) =
         (eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
@@ -281,14 +285,14 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     softmax_2d.ShareDataWith(*softmax).Resize({N, D});
     loss_2d.ShareDataWith(*loss).Resize({N, 1});
 
-    auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
-    auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
+    auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
+    auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);
 
     // step 1, obtain logit_max
     phi::DenseTensor logits_max;
     logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
 
-    auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
+    auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
     Eigen::DSizes<int, 1> along_axis(1);
     eigen_logits_max.device(*dev_ctx.eigen_device()) =
         eigen_logits.maximum(along_axis);
@@ -304,7 +308,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     eigen_softmax.device(*dev_ctx.eigen_device()) =
         (eigen_logits -
          eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
-            .unaryExpr(math::ValueClip<T>());
+            .unaryExpr(phi::funcs::ValueClip<T>());
 
     // step 3, obtain predict target
     phi::DenseTensor predicted_logits;
@@ -357,7 +361,8 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
     void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
 
-    auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
+    auto eigen_sum_exp_logits =
+        phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
     eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
         eigen_softmax.sum(along_axis);

@@ -366,8 +371,9 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     opts.reduce_op = distributed::ReduceOp::SUM;
     pg->AllReduce(in_out, in_out, opts)->Synchronize();
 
-    auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
-    auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
+    auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
+    auto eigen_predicted_logits =
+        phi::funcs::EigenMatrix<T>::From(predicted_logits);
 
     eigen_loss.device(*dev_ctx.eigen_device()) =
         (eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
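
Read together, the two functors compute a numerically stable softmax cross-entropy for model-parallel training: each rank holds a slice of the classes, so the row max (step 1), the predicted logit, and the exponential sum are each combined across ranks (via NCCL or the process group) before the loss is assembled. Per row i, with logits z_{ij}, target class y_i, and row max m_i, the quantity being computed is:

    \[
    \mathrm{softmax}_{ij} = \frac{e^{z_{ij}-m_i}}{\sum_j e^{z_{ij}-m_i}},
    \qquad
    \mathrm{loss}_i = \log\Bigl(\sum_j e^{z_{ij}-m_i}\Bigr) - \bigl(z_{i,y_i}-m_i\bigr)
    \]

which matches the final eigen_loss expression: the log of the all-reduced exponential sum minus the all-reduced predicted logit.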
@@ -22,9 +22,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/kernels/funcs/cross_entropy.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
 
 namespace paddle {
 namespace operators {
2 changes: 1 addition & 1 deletion paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/elementwise_kernel.h"
 
 namespace paddle {
namespace paddle {
8 changes: 4 additions & 4 deletions paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <string>
 
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/funcs/sequence2batch.h"
@@ -278,13 +278,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
   auto& act_gate_str = ctx.Attr<std::string>("gate_activation");          \
   auto& act_cell_str = ctx.Attr<std::string>("cell_activation");          \
   auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");     \
-  if (platform::MayIUse(platform::avx)) {                                 \
-    phi::funcs::VecActivations<T, platform::avx> act_functor;             \
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {             \
+    phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;   \
     act_gate = act_functor(act_gate_str);                                 \
     act_cell = act_functor(act_cell_str);                                 \
     act_cand = act_functor(act_cand_str);                                 \
   } else {                                                                \
-    phi::funcs::VecActivations<T, platform::isa_any> act_functor;         \
+    phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor; \
    act_gate = act_functor(act_gate_str);                                  \
     act_cell = act_functor(act_cell_str);                                 \
     act_cand = act_functor(act_cand_str);                                 \
8 changes: 4 additions & 4 deletions paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <string>
 
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
@@ -225,11 +225,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
 
     std::function<void(const int, const T*, T*)> fc_act;
     auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
-    if (platform::MayIUse(platform::avx)) {
-      phi::funcs::VecActivations<T, platform::avx> act_functor;
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
+      phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;
       fc_act = act_functor(fc_act_str);
     } else {
-      phi::funcs::VecActivations<T, platform::isa_any> act_functor;
+      phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor;
       fc_act = act_functor(fc_act_str);
     }

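
This file and the two fused LSTM kernels above repeat one dispatch idiom: probe the CPU once with MayIUse(avx) and select an AVX-specialized functor, else fall back to the generic isa_any version, so the per-element loop never re-checks the ISA. A self-contained sketch of the idiom (the functor and the MayIUse stub are stand-ins, not phi::funcs or phi::backends::cpu types):

    #include <functional>

    enum cpu_isa_t { isa_any, avx };  // mirrors the two levels used above

    // Stand-in for phi::backends::cpu::MayIUse(); a real implementation
    // queries CPUID once and caches the result.
    bool MayIUse(cpu_isa_t isa) { return isa == isa_any; }

    template <typename T, cpu_isa_t ISA>
    struct VecRelu {  // generic scalar fallback
      void operator()(int n, const T* x, T* y) const {
        for (int i = 0; i < n; ++i) y[i] = x[i] > T(0) ? x[i] : T(0);
      }
    };

    template <typename T>
    struct VecRelu<T, avx> {  // an AVX build would use intrinsics here
      void operator()(int n, const T* x, T* y) const {
        for (int i = 0; i < n; ++i) y[i] = x[i] > T(0) ? x[i] : T(0);
      }
    };

    std::function<void(int, const float*, float*)> PickRelu() {
      if (MayIUse(avx)) return VecRelu<float, avx>();  // dispatch once, up front
      return VecRelu<float, isa_any>();
    }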