diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 180214970be99..2899119cde9a8 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -23,7 +23,8 @@ set(CBLAS_TAG v0.3.7) # https://github.com/PaddlePaddle/Paddle/pull/52983 if(UNIX AND NOT APPLE - AND NOT WITH_ROCM) + AND NOT WITH_ROCM + AND NOT WITH_XPU) set(CBLAS_TAG v0.3.18) endif() diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 6202d4edf0496..c2be9fab7ee78 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") set(XPU_XFT_LIB_NAME "libxft.so") set(XPU_BASE_DATE "20230427") -set(XPU_XCCL_BASE_VERSION "1.0.13") +set(XPU_XCCL_BASE_VERSION "1.0.49.2") set(XPU_XFT_BASE_VERSION "latest") if(NOT DEFINED XPU_BASE_URL) @@ -30,35 +30,41 @@ if(NOT XPU_XFT_BASE_URL) ) endif() +if(WITH_XCCL_RDMA) + set(XPU_XCCL_PREFIX "xccl_rdma") +else() + set(XPU_XCCL_PREFIX "xccl_socket") +endif() + if(WITH_AARCH64) set(XPU_XRE_DIR_NAME "xre-kylin_aarch64") set(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") - set(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-kylin_aarch64") set(XPU_XFT_DIR_NAME "") # TODO: xft has no kylin output at now. elseif(WITH_SUNWAY) set(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") set(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") - set(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-deepin_sw6_64") set(XPU_XFT_DIR_NAME "") # TODO: xft has no deepin output at now. elseif(WITH_BDCENTOS) set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") - set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-bdcentos_x86_64") set(XPU_XFT_DIR_NAME "xft_bdcentos6u3_x86_64_gcc82") elseif(WITH_UBUNTU) set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") - set(XPU_XCCL_DIR_NAME "xccl-ubuntu_x86_64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-ubuntu_x86_64") set(XPU_XFT_DIR_NAME "xft_ubuntu1604_x86_64") elseif(WITH_CENTOS) set(XPU_XRE_DIR_NAME "xre-centos7_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64") - set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-bdcentos_x86_64") set(XPU_XFT_DIR_NAME "xft_bdcentos6u3_x86_64_gcc82") else() set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") - set(XPU_XCCL_DIR_NAME "xccl-ubuntu_x86_64") + set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-ubuntu_x86_64") set(XPU_XFT_DIR_NAME "xft_ubuntu1604_x86_64") endif() @@ -75,9 +81,6 @@ set(XPU_XFT_URL "${XPU_XFT_BASE_URL}/${XPU_XFT_DIR_NAME}.tar.gz") set(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) -set(XPU_CHECK_DEPENCE_URL - "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/check_xpu_dependence.sh" - CACHE STRING "" FORCE) set(XPU_XFT_GET_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/get_xft_dependence.sh" CACHE STRING "" FORCE) @@ -115,8 +118,8 @@ ExternalProject_Add( PREFIX ${SNAPPY_PREFIX_DIR} DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} DOWNLOAD_COMMAND - wget ${XPU_CHECK_DEPENCE_URL} && bash check_xpu_dependence.sh - ${XPU_BASE_URL} ${XPU_XCCL_BASE_URL} && wget ${XPU_PACK_DEPENCE_URL} && bash + bash ${CMAKE_SOURCE_DIR}/tools/xpu/check_xpu_dependence.sh ${XPU_BASE_URL} + ${XPU_XCCL_BASE_URL} && wget ${XPU_PACK_DEPENCE_URL} && bash pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} 
${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} && wget ${XPU_XFT_GET_DEPENCE_URL} && bash get_xft_dependence.sh ${XPU_XFT_URL} diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.cc b/paddle/fluid/distributed/collective/process_group_bkcl.cc index 47dd2241c2cde..a3c3e085c6df5 100644 --- a/paddle/fluid/distributed/collective/process_group_bkcl.cc +++ b/paddle/fluid/distributed/collective/process_group_bkcl.cc @@ -115,7 +115,13 @@ std::shared_ptr ProcessGroupBKCL::Recv( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_recv"; + VLOG(3) << "calling bkcl_recv" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", src_rank: " << src_rank << ", numel: " << output->numel() + << ", dtype: " << output->type() << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_recv(comm, output->data(), output->numel(), @@ -148,7 +154,14 @@ std::shared_ptr ProcessGroupBKCL::Send( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_send"; + VLOG(3) << "calling bkcl_send" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", dst_rank: " << dst_rank + << ", input numel: " << input.numel() + << ", dtype: " << input.type() << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_send(comm, input.data(), input.numel(), @@ -276,7 +289,14 @@ std::shared_ptr ProcessGroupBKCL::AllReduce( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_all_reduce"; + VLOG(3) << "calling bkcl_all_reduce" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", numel: " << input.numel() << ", dtype: " << input.type() + << ", reduce_type: " << ToBKCLRedType(opts.reduce_op) + << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_all_reduce(comm, input.data(), @@ -307,7 +327,13 @@ std::shared_ptr ProcessGroupBKCL::Broadcast( BKCLContext_t comm, const XPUStream& stream) { int root = opts.source_rank + opts.source_root; - VLOG(3) << "bkcl_broadcast"; + VLOG(3) << "calling bkcl_broadcast" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", root: " << root << ", numel: " << input.numel() + << ", dtype: " << input.type() << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_broadcast(comm, input.data(), @@ -346,7 +372,13 @@ std::shared_ptr ProcessGroupBKCL::AllGather( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_all_gather"; + VLOG(3) << "calling bkcl_all_gather" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", numel: " << in_tensor_maybe_partial.numel() + << ", dtype: " << input.type() << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_all_gather(comm, in_tensor_maybe_partial.data(), @@ -375,7 +407,15 @@ std::shared_ptr ProcessGroupBKCL::Reduce( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << 
"bkcl_reduce"; + VLOG(3) << "calling bkcl_reduce" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", root: " << opts.root_rank << ", numel: " << input.numel() + << ", dtype: " << input.type() + << ", reduce_type: " << ToBKCLRedType(opts.reduce_op) + << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_reduce(comm, input.data(), output->data(), @@ -405,7 +445,14 @@ std::shared_ptr ProcessGroupBKCL::ReduceScatter( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_reduce_scatter"; + VLOG(3) << "calling bkcl_reduce_scatter" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", numel: " << output->numel() << ", dtype: " << input.type() + << ", reduce_type: " << ToBKCLRedType(opts.reduce_op) + << ", sync_op: " << sync_op + << ", use_calc_stream: " << use_calc_stream; int r = bkcl_reduce_scatter( comm, input.data(), @@ -491,8 +538,13 @@ std::shared_ptr ProcessGroupBKCL::AllReduce( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_all_reduce"; - + VLOG(3) << "calling bkcl_all_reduce" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", numel: " << input.numel() << ", dtype: " << input.type() + << ", reduce_type: " << ToBKCLRedType(opts.reduce_op) + << ", sync_op: " << true << ", use_calc_stream: " << false; int r = bkcl_all_reduce(comm, input.data(), @@ -535,7 +587,13 @@ std::shared_ptr ProcessGroupBKCL::AllReduce( const phi::DenseTensor& input, BKCLContext_t comm, const XPUStream& stream) { - VLOG(3) << "bkcl_all_reduce"; + VLOG(3) << "calling bkcl_all_reduce" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", numel: " << input.numel() << ", dtype: " << input.type() + << ", reduce_type: " << ToBKCLRedType(opts.reduce_op) + << ", sync_op: " << sync_op << ", use_calc_stream: " << false; int r = bkcl_all_reduce(comm, input.data(), @@ -580,7 +638,13 @@ std::shared_ptr ProcessGroupBKCL::Broadcast( const XPUStream& stream) { const auto root = opts.source_rank * in_tensors.size() + opts.source_root; - VLOG(3) << "bkcl_broadcast"; + VLOG(3) << "calling bkcl_broadcast" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", root: " << root << ", numel: " << input.numel() + << ", dtype: " << input.type() << ", sync_op: " << true + << ", use_calc_stream: " << false; int r = bkcl_broadcast(comm, input.data(), @@ -626,7 +690,13 @@ std::shared_ptr ProcessGroupBKCL::Broadcast( const XPUStream& stream) { const auto root = opts.source_rank * in_tensors.size() + opts.source_root; - VLOG(3) << "bkcl_broadcast"; + VLOG(3) << "calling bkcl_broadcast" + << ", rank_id: " << platform::GetBKCLRankID(comm) + << ", dev_id: " << platform::GetBKCLDevID(comm) + << ", nranks: " << platform::GetBKCLNRanks(comm) + << ", root: " << root << ", numel: " << input.numel() + << ", dtype: " << input.type() << ", sync_op: " << sync_op + << ", use_calc_stream: " << false; int r = bkcl_broadcast(comm, input.data(), @@ -671,7 +741,12 @@ std::shared_ptr ProcessGroupBKCL::AllGather( 
      const phi::DenseTensor& input,
      BKCLContext_t comm,
      const XPUStream& stream) {
-        VLOG(3) << "bkcl_all_gather";
+        VLOG(3) << "calling bkcl_all_gather"
+                << ", rank_id: " << platform::GetBKCLRankID(comm)
+                << ", dev_id: " << platform::GetBKCLDevID(comm)
+                << ", nranks: " << platform::GetBKCLNRanks(comm)
+                << ", numel: " << input.numel() << ", dtype: " << input.type()
+                << ", sync_op: " << true << ", use_calc_stream: " << false;
         int r = bkcl_all_gather(comm,
                                 input.data(),
@@ -712,7 +787,12 @@ std::shared_ptr ProcessGroupBKCL::AllGather(
      const phi::DenseTensor& input,
      BKCLContext_t comm,
      const XPUStream& stream) {
-        VLOG(3) << "bkcl_all_gather";
+        VLOG(3) << "calling bkcl_all_gather"
+                << ", rank_id: " << platform::GetBKCLRankID(comm)
+                << ", dev_id: " << platform::GetBKCLDevID(comm)
+                << ", nranks: " << platform::GetBKCLNRanks(comm)
+                << ", numel: " << input.numel() << ", dtype: " << input.type()
+                << ", sync_op: " << sync_op << ", use_calc_stream: " << false;
         int r = bkcl_all_gather(comm,
                                 input.data(),
diff --git a/paddle/fluid/distributed/collective/process_group_custom.cc b/paddle/fluid/distributed/collective/process_group_custom.cc
index b6c7063fd6fb7..1e4d1df337bdb 100644
--- a/paddle/fluid/distributed/collective/process_group_custom.cc
+++ b/paddle/fluid/distributed/collective/process_group_custom.cc
@@ -125,12 +125,14 @@ void ProcessGroupCustom::BroadcastUniqueCustomID(
     std::vector& ccl_ids) {  // NOLINT
   if (rank_ == 0) {
     for (size_t i = 0; i < ccl_ids.size(); i++) {
-      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
+      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(gid_) + "/" +
+                 std::to_string(i);
       store_->set(key, ccl_ids[i]);
     }
   } else {
     for (size_t i = 0; i < ccl_ids.size(); i++) {
-      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
+      auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(gid_) + "/" +
+                 std::to_string(i);
       ccl_ids[i] = store_->get(key);
     }
   }
diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
index ed21e1171c17c..08a5f57d293af 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -68,7 +68,6 @@
     "matmul_double_grad",
     "tanh_double_grad",
     "add_double_grad",
-    "multiply_double_grad",
     "subtract_double_grad",
 ]
diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
index 38c9ce3d8091e..693365c9f47ca 100644
--- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
@@ -150,7 +150,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext(
   DeviceContext* dev_ctx = nullptr;
-  // only gpu needs update. xpu not need, because xpu memcpy op kernel is
+  // only gpu needs update; xpu does not, because the xpu memcpy op kernel is
   // synchronous.
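The store-key change in process_group_custom.cc above is worth pausing on: the ccl root-id exchange now namespaces its keys by group id (gid_), so two process groups that share one store can no longer read each other's ids. A minimal sketch of the collision this prevents, using a hypothetical in-memory stand-in for the distributed key-value store:

#include <cassert>
#include <map>
#include <string>

// Hypothetical stand-in for the TCP store that all process groups share.
struct Store {
  std::map<std::string, std::string> kv;
  void set(const std::string& k, const std::string& v) { kv[k] = v; }
  std::string get(const std::string& k) { return kv.at(k); }
};

int main() {
  Store store;
  // Old scheme: both groups write the same key, the second write clobbers
  // the first, and one group bootstraps with the other group's root id.
  store.set("ProcessGroupCustom/ccl_ids/0", "root_id_of_group_0");
  store.set("ProcessGroupCustom/ccl_ids/0", "root_id_of_group_1");
  // New scheme: the gid segment gives each group its own namespace.
  store.set("ProcessGroupCustom/ccl_ids/0/0", "root_id_of_group_0");
  store.set("ProcessGroupCustom/ccl_ids/1/0", "root_id_of_group_1");
  assert(store.get("ProcessGroupCustom/ccl_ids/0/0") == "root_id_of_group_0");
  return 0;
}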
  if (platform::is_gpu_place(place_) || platform::is_custom_place(place_)) {
    VLOG(6) << "Parse DeviceContext for " << op_type
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 21547331aa08f..66658020b66c6 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -114,6 +114,25 @@ struct SimpleOpTypeSetTeller : public Teller {
         "atanh", "ceil", "celu", "floor", "round",
         "sign", "logical_not", "reciprocal", "tanh_shrink", "logsigmoid",
         "erf", "bitwise_not", "equal", "not_equal", "rsqrt"};
+
+    // Static shape mode does not support inputs whose shape is 0-D or 1-D.
+    if (!with_dynamic_shape) {
+      auto inputs = desc.Inputs();
+      for (auto iter : inputs) {
+        for (auto var_name : iter.second) {
+          auto* block = desc.Block();
+          if (block) {
+            auto* var_desc = block->FindVar(var_name);
+            // Can't get feed op's TensorDesc
+            if (op_type != "feed" && var_desc && !var_desc->Persistable()) {
+              const auto shape = var_desc->GetShape();
+              if (shape.size() == 1 || shape.size() == 0) return false;
+            }
+          }
+        }
+      }
+    }
+
     if (act_op_list.find(op_type) != act_op_list.end()) {
       auto* block = desc.Block();
       if (block == nullptr) {
@@ -122,15 +141,6 @@ struct SimpleOpTypeSetTeller : public Teller {
                    "the pass.";
        return false;
      }
-      auto x_var_name = desc.Input("X")[0];
-      auto* x_var_desc = block->FindVar(x_var_name);
-      const auto x_shape = x_var_desc->GetShape();
-      if (!with_dynamic_shape && (x_shape.size() == 1 || x_shape.size() == 0)) {
-        VLOG(3) << op_type
-                << " op does not support input's dim is 1 or 0 in tensorrt "
-                   "static shape mode.";
-        return false;
-      }
 #if !IS_TRT_VERSION_GE(7000)
      if (op_type == "erf") {
        VLOG(3) << op_type << " op does not support tensorrt.";
@@ -138,6 +148,9 @@
      }
 #endif
 #if !IS_TRT_VERSION_GE(8600)
+      auto x_var_name = desc.Input("X")[0];
+      auto* x_var_desc = block->FindVar(x_var_name);
+      const auto x_shape = x_var_desc->GetShape();
      if (x_shape.size() == 0 && unary_list.find(op_type) != unary_list.end()) {
        VLOG(3) << op_type
                << " op does not support 0 dim input when TensorRT < 8.6.";
@@ -145,24 +158,6 @@
      }
 #endif
    }
-    // In static shape in Paddle-TRT, we can't allow that one op has a
-    // 1D intermediate tensor as input.
-    if (!with_dynamic_shape) {
-      auto inputs = desc.Inputs();
-      for (auto iter : inputs) {
-        for (auto var_name : iter.second) {
-          auto* block = desc.Block();
-          if (block) {
-            auto* var_desc = block->FindVar(var_name);
-            // Can't get feed op's TensorDesc
-            if (op_type != "feed" && var_desc && !var_desc->Persistable()) {
-              const auto shape = var_desc->GetShape();
-              if (shape.size() == 1) return false;
-            }
-          }
-        }
-      }
-    }

    if (op_type == "dropout") {
      /*
@@ -1491,6 +1486,7 @@ struct SimpleOpTypeSetTeller : public Teller {
                   "elementwise op.";
        return false;
      }
+
      if (x_var_desc->Persistable() && !with_dynamic_shape) {
        VLOG(3)
            << "Input X is a parameter which is not supported for "
@@ -1864,8 +1860,10 @@ struct SimpleOpTypeSetTeller : public Teller {
      auto x_var_name = desc.Input("X")[0];
      auto* x_var_desc = block->FindVar(x_var_name);
      const auto x_shape = x_var_desc->GetShape();
-      if (x_shape.size() == 1) {
-        VLOG(3) << "mish op does not support input's dim is 1 in tensorrt.";
+      if ((!with_dynamic_shape && x_shape.size() == 1) || x_shape.size() == 0) {
+        VLOG(3) << op_type
+                << " op does not support 0-dim input, or 1-dim input in "
+                   "tensorrt static shape mode.";
        return false;
      }
    }
@@ -2598,6 +2596,15 @@ struct SimpleOpTypeSetTeller : public Teller {
                   "the pass.";
        return false;
      }
+
+#if IS_TRT_VERSION_LT(8000)
+      auto x_var_name = desc.Input("X")[0];
+      auto* x_var_desc = block->FindVar(x_var_name);
+      const auto x_shape = x_var_desc->GetShape();
+      if (x_shape.size() == 0) {
+        return false;  // 0-dim input is not supported.
+      }
+#endif
    }

    if (op_type == "grid_sampler") {
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 527e843d05bb8..931372a0d9a43 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -1039,8 +1039,8 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 #elif defined(PADDLE_WITH_XPU)
   return GetAllocator(place)->Allocate(size);
 #else
-  PADDLE_THROW(platform::errors::PreconditionNotMet(
-      "Not compiled with GPU or XPU or NPU."));
+  PADDLE_THROW(
+      platform::errors::PreconditionNotMet("Not compiled with GPU or XPU."));
 #endif
 }
diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h
index a6d807b028c1b..07a1c46ac0923 100644
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
@@ -95,7 +95,7 @@ struct BeamSearchDecodeFunctor {
   } else {
     BeamSearchDecoder beam_search_decoder(beam_size_, end_id_);
-    // Check if the tensor is on GPU or NPU. If so, use the CPU copy instead
+    // Check if the tensor is on GPU.
If so, use the CPU copy instead if (tensor_on_gpu_ || tensor_on_npu_) { beam_search_decoder.Backtrace( step_ids_, step_scores_, id_tensor_, score_tensor_); diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc index aee2b0c86b81e..2efd5b46bdc09 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cc +++ b/paddle/fluid/operators/collective/c_embedding_op.cc @@ -79,7 +79,7 @@ class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) The input represents embedding tensors, " "which is a learnable parameter."); AddInput("Ids", - "An input with type int32 or int64 in CPU and GPU, int32 in NPU " + "An input with type int32 or int64 in CPU and GPU, " "contains the ids to be looked up in W."); AddOutput("Out", "The lookup results, which have the same type as W."); diff --git a/paddle/fluid/operators/collective/global_gather_op.h b/paddle/fluid/operators/collective/global_gather_op.h index 0d3b4ed92e9b2..723c5e48a5ae4 100644 --- a/paddle/fluid/operators/collective/global_gather_op.h +++ b/paddle/fluid/operators/collective/global_gather_op.h @@ -28,7 +28,7 @@ namespace operators { template class GlobalGatherOpCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support global gather op for cpu kernel now.")); } diff --git a/paddle/fluid/operators/collective/global_scatter_op.h b/paddle/fluid/operators/collective/global_scatter_op.h index 3cb2a3c7fc41b..fc4b48500c071 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.h +++ b/paddle/fluid/operators/collective/global_scatter_op.h @@ -28,7 +28,7 @@ namespace operators { template class GlobalScatterOpCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support global scatter op for cpu kernel now.")); } diff --git a/paddle/fluid/operators/collective/partial_allgather_op.h b/paddle/fluid/operators/collective/partial_allgather_op.h index 6b827a2656f29..815558d0227eb 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.h +++ b/paddle/fluid/operators/collective/partial_allgather_op.h @@ -29,7 +29,7 @@ namespace operators { template class PartialAllGatherOpCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support partial_allgather for cpu kernel now.")); } diff --git a/paddle/fluid/operators/collective/partial_recv_op.h b/paddle/fluid/operators/collective/partial_recv_op.h index fdf3f02b0d679..baf47ef9dff8d 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.h +++ b/paddle/fluid/operators/collective/partial_recv_op.h @@ -27,7 +27,7 @@ namespace operators { template class PartialRecvOpCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support partial_recv for cpu kernel now.")); } diff --git 
a/paddle/fluid/operators/collective/partial_send_op.h b/paddle/fluid/operators/collective/partial_send_op.h index 773125be7d40f..b7b72789b87ff 100644 --- a/paddle/fluid/operators/collective/partial_send_op.h +++ b/paddle/fluid/operators/collective/partial_send_op.h @@ -28,7 +28,7 @@ namespace operators { template class PartialSendOpCPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support partial_send for cpu kernel now.")); } diff --git a/paddle/fluid/operators/collective/recv_v2_op.h b/paddle/fluid/operators/collective/recv_v2_op.h index 3430cdb73aa1a..e76e4a7b55197 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.h +++ b/paddle/fluid/operators/collective/recv_v2_op.h @@ -27,7 +27,7 @@ namespace operators { template class RecvOpV2CPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx UNUSED) const override { PADDLE_THROW(platform::errors::Unavailable( "Do not support recv for cpu kernel now.")); } diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 7d1ae606d1710..7db85eebb4f84 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -27,11 +27,6 @@ cc_library( SRCS while_op_helper.cc DEPS operator op_variant) -cc_test( - conditional_block_op_test - SRCS conditional_block_op_test.cc - DEPS conditional_block_op standalone_executor executor) - if(WITH_UNITY_BUILD) target_link_libraries(paddle_operators_controlflow_unity conditional_block_op) else() diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 3e07b1f155452..1bca2068f83d8 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -91,9 +91,5 @@ cc_library( mask_util SRCS mask_util.cc DEPS memory) -cc_test( - mask_util_test - SRCS mask_util_test.cc - DEPS memory mask_util) detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util) diff --git a/paddle/fluid/operators/dlnne/CMakeLists.txt b/paddle/fluid/operators/dlnne/CMakeLists.txt index 6680e1f23e08d..de017546fb34e 100644 --- a/paddle/fluid/operators/dlnne/CMakeLists.txt +++ b/paddle/fluid/operators/dlnne/CMakeLists.txt @@ -45,8 +45,3 @@ op_library( #endif() target_link_libraries(dlnne_engine_op ${DLNNE_LIB} ${CURT_LIB}) - -cc_test( - test_dlnne_engine_op - SRCS dlnne_engine_op_test.cc - DEPS dlnne_engine_op analysis) diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc index 6594df2f5164f..58d81ebf8be06 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc @@ -66,7 +66,6 @@ class FusedGemmEpilogueXPUKernel : public framework::OpKernel { phi::XpuFcInfo fc_info; phi::GetFCInfo(x_mat_dims, y->dims(), trans_x, trans_y, &fc_info); - VLOG(0) << "FusedGemmEpilogueXPUKernel 000"; xpu::Context* xpu_ctx = dev_ctx.x_context(); const XPUType* x_ptr = reinterpret_cast(x->data()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 5cea4fa9e0573..40c82619db4a3 100644 
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -622,13 +622,12 @@ class ReduceBaseOp : public framework::OperatorWithKernel {
   // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_MKLDNN
   if (input_data_type == framework::proto::VarType::FP16) {
-      PADDLE_ENFORCE_EQ(
-          platform::is_gpu_place(ctx.GetPlace()) ||
-              platform::is_xpu_place(ctx.GetPlace()) ||
-              platform::is_custom_place(ctx.GetPlace()),
-          true,
-          platform::errors::InvalidArgument(
-              "float16 can only be used on GPU or NPU or XPU place"));
+      PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) ||
+                            platform::is_xpu_place(ctx.GetPlace()) ||
+                            platform::is_custom_place(ctx.GetPlace()),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "float16 can only be used on GPU or XPU place"));
   }
   return phi::KernelKey(input_data_type, ctx.GetPlace());
 }
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 633ef748be698..2fb7883cb3f71 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -47,7 +47,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
           platform::is_custom_place(ctx.GetPlace()),
           true,
           platform::errors::InvalidArgument(
-              "float16 can only be used on GPU/NPU/XPU and custom place"));
+              "float16 can only be used on GPU/XPU and custom place"));
     }
     return phi::KernelKey(
         ctx.GetPlace(), layout_, phi::TransToPhiDataType(input_data_type));
@@ -130,7 +130,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
           platform::is_xpu_place(ctx.GetPlace()) ||
           platform::is_custom_place(ctx.GetPlace())))
        PADDLE_THROW(platform::errors::InvalidArgument(
-            "float16 can only be used on GPU/NPU/XPU and custom place"));
+            "float16 can only be used on GPU/XPU and custom place"));
     }
     return phi::KernelKey(
         ctx.GetPlace(), layout_, phi::TransToPhiDataType(input_data_type));
 }
diff --git a/paddle/fluid/platform/device/xpu/bkcl_helper.h b/paddle/fluid/platform/device/xpu/bkcl_helper.h
index 1f44bc0a8c98b..dcb17bfd2b932 100644
--- a/paddle/fluid/platform/device/xpu/bkcl_helper.h
+++ b/paddle/fluid/platform/device/xpu/bkcl_helper.h
@@ -62,6 +62,18 @@ inline BKCLDataType ToBKCLDataType(framework::proto::VarType::Type type) {
   }
 }
+inline int GetBKCLRankID(BKCLContext_t comm) {
+  return reinterpret_cast<int*>(comm)[0];
+}
+
+inline int GetBKCLDevID(BKCLContext_t comm) {
+  return reinterpret_cast<int*>(comm)[1];
+}
+
+inline int GetBKCLNRanks(BKCLContext_t comm) {
+  return reinterpret_cast<int*>(comm)[2];
+}
+
 class BKCLGroupGuard {
  public:
   static std::mutex &BKCLMutex() {
diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h
index f56688de09a32..e2de1e5a9abe3 100644
--- a/paddle/fluid/platform/device_event_base.h
+++ b/paddle/fluid/platform/device_event_base.h
@@ -65,7 +65,7 @@ class DeviceEvent {
                           MaxDeviceTypes,
                           type_id_));
 #ifndef PADDLE_WITH_CUSTOM_DEVICE
-    // TODO(Aurelius84): only support CPU/CUDA/NPU.
+    // TODO(Aurelius84): only support CPU/CUDA.
     PADDLE_ENFORCE_LT(type_id_,
                       3,
                       platform::errors::Unavailable(
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 2caf978db6f23..7f32c14c493af 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -931,7 +931,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
   // with FLAGS_set_to_1d=True. In this case, one `None` should be pop out,
   // otherwise the output shape will be not correct.
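The GetBKCLRankID/GetBKCLDevID/GetBKCLNRanks helpers added to bkcl_helper.h only make sense if the opaque BKCLContext_t begins with three int fields in the order rank id, device id, nranks; that layout is implied by the [0]/[1]/[2] indexing, not stated anywhere in this diff, so treat it as an assumption. A mock of that assumption:

#include <cassert>

// Mock of the assumed layout: the helpers read ints 0..2 of the opaque
// context, so rank_id, dev_id and nranks must be its first three fields.
struct MockBKCLContext {
  int rank_id;  // index [0]
  int dev_id;   // index [1]
  int nranks;   // index [2]
};

using BKCLContext_t = void*;  // stand-in for the real opaque handle

inline int GetBKCLRankID(BKCLContext_t comm) {
  return reinterpret_cast<int*>(comm)[0];
}

int main() {
  MockBKCLContext ctx{3, 7, 8};
  assert(GetBKCLRankID(&ctx) == 3);  // reads the first int field
  return 0;
}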
if (static_cast(decrease_axis.size()) == tensor->dims().size()) { - VLOG(0) + VLOG(1) << "Warning: In Tensor '__getitem__', if the number of scalar " "elements " "in the index is equal to the rank of the Tensor, the output " diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index aa1ee4724925e..65eac1e3dc6fd 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1060,7 +1060,7 @@ void BindImperative(py::module *m_ptr) { // not correct. if (static_cast(decrease_axis.size()) == tensor->dims().size()) { - VLOG(0) << "Warning: In Tensor '__getitem__', if the number " + VLOG(1) << "Warning: In Tensor '__getitem__', if the number " "of scalar " "elements " "in the index is equal to the rank of the Tensor, " diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 553701906f59e..3f20a2498f840 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -188,39 +188,7 @@ using namespace paddle::framework; // NOLINT void BindParallelExecutor(pybind11::module &m) { // NOLINT // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); - py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( - ExecutionStrategy allows the user to more preciously control how to run - the program in ParallelExecutor by setting the property. - - Returns: - ExecutionStrategy: An ExecutionStrategy object. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - import paddle.nn.functional as F - - paddle.enable_static() - - x = static.data(name='x', shape=[None, 13], dtype='float32') - y = static.data(name='y', shape=[None, 1], dtype='float32') - y_predict = static.nn.fc(input=x, size=1, act=None) - - cost = F.square_error_cost(input=y_predict, label=y) - avg_loss = paddle.mean(cost) - - sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_loss) - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_threads = 4 - - train_exe = static.ParallelExecutor(use_cuda=False, - loss_name=avg_loss.name, - exec_strategy=exec_strategy) - )DOC"); + py::class_ exec_strategy(pe, "ExecutionStrategy"); py::enum_(m, "DeviceType", py::arithmetic()) .value("CPU", paddle::platform::DeviceType::CPU) @@ -233,29 +201,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT [](const ExecutionStrategy &self) { return self.num_threads_; }, [](ExecutionStrategy &self, size_t num_threads) { self.num_threads_ = num_threads; - }, - R"DOC( - The type is INT, num_threads represents the size of thread pool that - used to run the operators of the current program in ParallelExecutor. - If :math:`num\_threads=1`, all the operators will execute one by one, - but the order maybe difference between iterations. - If it is not set, it will be set in ParallelExecutor according to the - device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, - :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. - if it is not set, ParallelExecutor will get the cpu count by calling - `multiprocessing.cpu_count()`. Default 0. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_threads = 4 - )DOC") + }) .def_property( "_use_device", [](const ExecutionStrategy &self) { return self.use_device_; }, @@ -268,11 +214,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, [](ExecutionStrategy &self, bool allow_op_delay) { self.allow_op_delay_ = allow_op_delay; - }, - R"DOC(The type is BOOL, allow_op_delay represents whether to delay the - communication operators to run, it may make the execution faster. - Note that this option is invalid now, and it will be removed in - next version. Default False.)DOC") + }) .def_property( "num_iteration_per_drop_scope", [](const ExecutionStrategy &self) { @@ -280,30 +222,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT }, [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; - }, - R"DOC(The type is INT, num_iteration_per_drop_scope indicates how - many iterations to clean up the temp variables which - is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. - Default 100. - - .. note:: - 1. If you fetch data when calling the 'run', the ParallelExecutor - will clean up the temp variables at the end of the current iteration. - 2. In some NLP model, it may cause the GPU memory is insufficient, - in this case, you should reduce `num_iteration_per_drop_scope`. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_iteration_per_drop_scope = 10 - )DOC") + }) .def_property( "num_iteration_per_run", [](const ExecutionStrategy &self) { @@ -311,29 +230,13 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT }, [](ExecutionStrategy &self, size_t num_iteration_per_run) { self.num_iteration_per_run_ = num_iteration_per_run; - }, - R"DOC(This config that how many iteration the executor will run when - user call exe.run() in python。Default: 1. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_iteration_per_run = 10 - )DOC") + }) .def_property( "use_thread_barrier", [](const ExecutionStrategy &self) { return self.thread_barrier_; }, [](ExecutionStrategy &self, bool use_thread_barrier) { self.thread_barrier_ = use_thread_barrier; - }, - R"DOC(This config that the this is distributed training with parameter server - )DOC") + }) .def_property( "_dry_run", [](const ExecutionStrategy &self) { return self.dry_run_; }, diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 65132bc68fa0d..6f8ae115bd12a 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -434,7 +434,7 @@ void SetTensorFromPyArrayT( } #else PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, " + "Cannot use IPUPlace in CPU/GPU/XPU version, " "Please recompile or reinstall Paddle with IPU support.")); #endif } else if (paddle::platform::is_custom_place(place)) { @@ -1106,7 +1106,7 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, return py_arr; #else PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use CustomPlace in CPU/GPU/XPU/NPU version, " + "Cannot use CustomPlace in CPU/GPU/XPU version, " "Please recompile or reinstall Paddle with CustomPlace " "support.")); #endif diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 9d1b2ce5b4933..3eaafe2b407ad 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -617,6 +617,7 @@ func : multiply_double_grad optional : grad_x_grad, grad_y_grad inplace : (grad_x_grad -> grad_out_grad) + backward : multiply_triple_grad composite : multiply_double_grad(x, y, grad_out, grad_x_grad, grad_y_grad, axis, x_grad, y_grad, grad_out_grad) - backward_op : multiply_grad @@ -631,6 +632,17 @@ composite: multiply_grad(x, y, out_grad, axis, x_grad, y_grad) backward : multiply_double_grad +- backward_op : multiply_triple_grad + forward : multiply_double_grad (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, int aixs = -1) -> Tensor(grad_x), Tensor(grad_y), Tensor(grad_grad_out) + args : (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, Tensor grad_x_grad, Tensor grad_y_grad, Tensor grad_grad_out_grad, int axis = -1) + output : Tensor(x_grad), Tensor(y_grad), Tensor(fwd_grad_out_grad), Tensor(fwd_grad_grad_x_grad), Tensor(fwd_grad_grad_y_grad) + infer_meta : + func : GeneralQuinaryGradInferMeta + param : [x, y, fwd_grad_out, fwd_grad_grad_x, fwd_grad_grad_y] + kernel : + func : multiply_triple_grad + optional : fwd_grad_grad_x, fwd_grad_grad_y, grad_x_grad, grad_y_grad, grad_grad_out_grad + - backward_op : norm_grad forward : norm (Tensor x, int axis, float epsilon, bool is_test) -> Tensor(out), Tensor(norm) args : (Tensor x, Tensor norm, Tensor out_grad, int axis, float epsilon, bool is_test) diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index 72562f0c001f0..06d9b450a83ab 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -41,7 +41,7 @@ inline size_t Alignment(size_t size, alignment = alignment; #else PADDLE_THROW(phi::errors::PreconditionNotMet( - "Fluid is not compiled with CUDA/XPU/NPU.")); + "Fluid is 
not compiled with CUDA/XPU.")); #endif } } diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 4c3688b0badfa..6f10baf07bb5a 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -185,6 +185,9 @@ XPUOpMap& get_kl2_ops() { {"deformable_conv_v1", XPUKernelSet({phi::DataType::FLOAT32})}, {"depthwise_conv2d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"depthwise_conv2d", XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d_transpose_grad", + XPUKernelSet({phi::DataType::FLOAT32})}, + {"depthwise_conv2d_transpose", XPUKernelSet({phi::DataType::FLOAT32})}, {"diag_v2", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16, @@ -531,6 +534,16 @@ XPUOpMap& get_kl2_ops() { {"p_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"pad3d_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"pad3d", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"pad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::FLOAT16})}, + {"pad_grad", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::INT32, + phi::DataType::INT16, + phi::DataType::FLOAT16})}, {"pixel_shuffle", XPUKernelSet({phi::DataType::FLOAT32})}, {"pixel_shuffle_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"pool2d_grad", diff --git a/paddle/phi/kernels/impl/eigvalsh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigvalsh_grad_kernel_impl.h index 7248985bf294c..18a6a0518a3cc 100644 --- a/paddle/phi/kernels/impl/eigvalsh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigvalsh_grad_kernel_impl.h @@ -27,8 +27,8 @@ template void EigvalshGradKernel(const Context& dev_ctx, const DenseTensor& out_v, const DenseTensor& out_w_grad, - const std::string& uplo, - bool is_test, + const std::string& uplo UNUSED, + bool is_test UNUSED, DenseTensor* x_grad) { auto tV = phi::TransposeLast2Dim(dev_ctx, phi::Conj(dev_ctx, out_v)); diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 400334ad4e04e..92a4f99c6eb77 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -752,7 +752,7 @@ void EinsumKernel(const Context& dev_ctx, const std::string& equation, DenseTensor* out, std::vector cache, - std::vector xshape) { + std::vector xshape UNUSED) { std::vector tmp; // for the sake of compatibility, we may load and run v2.3 EinsumOp. Output // may have nullptr and the cache.size() is not equal to inputs.size(). 
refer diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 15f99a58fa5a5..3ce1e721b968e 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -119,7 +119,9 @@ void SubtractDoubleGradImpl(const Context& dev_ctx, template struct DivGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } + HOSTDEVICE T operator()(T x UNUSED, T y, T out UNUSED, T dout) const { + return dout / y; + } }; template @@ -136,7 +138,7 @@ struct DivGradDX> { template struct DivGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + HOSTDEVICE T operator()(T x UNUSED, T y, T out, T dout) const { return -dout * out / y; } }; @@ -857,14 +859,14 @@ struct MinGradDy { template struct HeavisideGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + HOSTDEVICE T operator()(T x UNUSED, T y UNUSED, T out UNUSED, T dout) const { return dout * static_cast(0); } }; template struct HeavisideGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + HOSTDEVICE T operator()(T x, T y UNUSED, T out UNUSED, T dout) const { return dout * static_cast(x == static_cast(0)); } }; diff --git a/paddle/phi/kernels/impl/lamb_kernel_impl.h b/paddle/phi/kernels/impl/lamb_kernel_impl.h index e0850b8aef0d9..b02d2a517a1c5 100644 --- a/paddle/phi/kernels/impl/lamb_kernel_impl.h +++ b/paddle/phi/kernels/impl/lamb_kernel_impl.h @@ -128,7 +128,7 @@ void ComputeImpl(const Context& dev_ctx, float beta1_f, float beta2_f, float epsilon_f, - bool multi_precision, + bool multi_precision UNUSED, DenseTensor* param_out, DenseTensor* mom1_out, DenseTensor* mom2_out, diff --git a/paddle/phi/kernels/impl/lu_kernel_impl.h b/paddle/phi/kernels/impl/lu_kernel_impl.h index 5663484362a8e..e9ba46d0c162c 100644 --- a/paddle/phi/kernels/impl/lu_kernel_impl.h +++ b/paddle/phi/kernels/impl/lu_kernel_impl.h @@ -474,7 +474,7 @@ void Unpack_Pivot(const Context& dev_ctx, const DenseTensor& Pivot, DenseTensor* P, int h, - int w) { + int w UNUSED) { auto dims = Pivot.dims(); auto Pdimvec = vectorize(dims); auto prank = Pdimvec.size(); diff --git a/paddle/phi/kernels/impl/unstack_kernel_impl.h b/paddle/phi/kernels/impl/unstack_kernel_impl.h index 030f4a62c6e00..102126a1e3307 100644 --- a/paddle/phi/kernels/impl/unstack_kernel_impl.h +++ b/paddle/phi/kernels/impl/unstack_kernel_impl.h @@ -26,7 +26,7 @@ template void UnStackKernel(const Context &dev_ctx, const DenseTensor &x, int axis, - int num, + int num UNUSED, std::vector outs) { auto *dy = &x; auto dx = outs; diff --git a/paddle/phi/kernels/onednn/conv_handler.h b/paddle/phi/kernels/onednn/conv_handler.h index 2be0ba5649711..fd6cf0c577849 100644 --- a/paddle/phi/kernels/onednn/conv_handler.h +++ b/paddle/phi/kernels/onednn/conv_handler.h @@ -240,10 +240,10 @@ class ConvOneDNNHandlerT const std::string& padding_algorithm, const std::vector& dilations_in, int groups, - const std::string& data_format, + const std::string& data_format UNUSED, bool is_test, - phi::DenseTensor* filter_grad, - phi::DenseTensor* in_x_grad, + phi::DenseTensor* filter_grad UNUSED, + phi::DenseTensor* in_x_grad UNUSED, const std::string& unique_name) : funcs::OneDNNHandlerT& paddings, bool ceil_mode, bool exclusive, - const std::string& data_format, + const std::string& data_format UNUSED, const std::string& pooling_type, bool global_pooling, bool adaptive, diff --git a/paddle/phi/kernels/onednn/reduce_kernel_impl.h 
b/paddle/phi/kernels/onednn/reduce_kernel_impl.h index 69f667c36624b..7c512c6e3eb4e 100644 --- a/paddle/phi/kernels/onednn/reduce_kernel_impl.h +++ b/paddle/phi/kernels/onednn/reduce_kernel_impl.h @@ -118,7 +118,7 @@ void ReduceGradKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* x_grad, dnnl::algorithm binary_type, - dnnl::algorithm reduction_type, + dnnl::algorithm reduction_type UNUSED, float scale_x, float scale_y) { reduce_all = recompute_reduce_all(x, dims, reduce_all); diff --git a/paddle/phi/kernels/onednn/slice_kernel.cc b/paddle/phi/kernels/onednn/slice_kernel.cc index 6c927018264bf..dfda78de77c06 100644 --- a/paddle/phi/kernels/onednn/slice_kernel.cc +++ b/paddle/phi/kernels/onednn/slice_kernel.cc @@ -25,7 +25,7 @@ void SliceKernel(const Context& dev_ctx, const std::vector& axes, const IntArray& starts, const IntArray& ends, - const std::vector& infer_flags, + const std::vector& infer_flags UNUSED, const std::vector& decrease_axis, DenseTensor* out) { const auto& onednn_engine = dev_ctx.GetEngine(); diff --git a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h index 4323a23e0e60c..f9c620c925689 100644 --- a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h +++ b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h @@ -131,7 +131,7 @@ void ComputeRowImpl(const Context& dev_ctx, float beta1_f, float beta2_f, float epsilon_f, - bool multi_precision, + bool multi_precision UNUSED, DenseTensor* param_out, DenseTensor* mom1_out, DenseTensor* mom2_out, diff --git a/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc index fe8b8ad6f8166..09c307a5c005f 100644 --- a/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc @@ -38,10 +38,10 @@ void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx, const DenseTensor& rulebook, const DenseTensor& counter, const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, + const std::vector& paddings UNUSED, + const std::vector& dilations UNUSED, + const std::vector& strides UNUSED, + const int groups UNUSED, const bool subm, const std::string& key, SparseCooTensor* x_grad, diff --git a/paddle/phi/kernels/sparse/cpu/conv_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc index 48f04ad1ddfa5..7fcbb5cfdd1f2 100644 --- a/paddle/phi/kernels/sparse/cpu/conv_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc @@ -34,7 +34,7 @@ void Conv3dCooCPUKernel(const CPUContext& dev_ctx, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, - const int groups, + const int groups UNUSED, const bool subm, const std::string& key, SparseCooTensor* out, diff --git a/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc index 9db36ace02e4d..4b61b61b5e254 100644 --- a/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc @@ -97,6 +97,36 @@ void Conv2dTransposeGradKernel(const Context& ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose_grad"); } +template +void DepthwiseConv2dTransposeGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& 
padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + Conv2dTransposeGradKernel(ctx, + x, + filter, + dout, + strides, + paddings, + output_padding, + output_size, + padding_algorithm, + groups, + dilations, + data_format, + dx, + dfilter); +} } // namespace phi PD_REGISTER_KERNEL(conv2d_transpose_grad, @@ -104,3 +134,8 @@ PD_REGISTER_KERNEL(conv2d_transpose_grad, ALL_LAYOUT, phi::Conv2dTransposeGradKernel, float) {} +PD_REGISTER_KERNEL(depthwise_conv2d_transpose_grad, + XPU, + ALL_LAYOUT, + phi::DepthwiseConv2dTransposeGradKernel, + float) {} diff --git a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc index 1b3c31f665c7c..f658f06a9908d 100644 --- a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc @@ -145,8 +145,39 @@ void Conv2dTransposeKernel(const Context& ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose_v2"); } } +template +void DepthwiseConv2dTransposeKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + Conv2dTransposeKernel(ctx, + x, + filter, + strides, + paddings, + output_padding, + output_size, + padding_algorithm, + groups, + dilations, + data_format, + out); +} } // namespace phi +PD_REGISTER_KERNEL(depthwise_conv2d_transpose, + XPU, + ALL_LAYOUT, + phi::DepthwiseConv2dTransposeKernel, + float) {} PD_REGISTER_KERNEL(conv2d_transpose, XPU, diff --git a/paddle/phi/kernels/xpu/pad_grad_kernel.cc b/paddle/phi/kernels/xpu/pad_grad_kernel.cc new file mode 100644 index 0000000000000..45fc3393412cd --- /dev/null +++ b/paddle/phi/kernels/xpu/pad_grad_kernel.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
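The depthwise_conv2d_transpose_grad kernel registered just above does no work of its own: a depthwise transposed convolution is a grouped transposed convolution with groups equal to the channel count, so the existing Conv2dTransposeGradKernel already covers it and the wrapper mainly binds the op name for the xpu2_op_list entries. A reduced sketch of the delegation pattern, with illustrative names:

#include <vector>

// Illustrative only: a "specialized" kernel that forwards verbatim to the
// general kernel, so only the registered name differs.
template <typename T>
void GeneralGradKernel(const std::vector<T>& dout, int groups,
                       std::vector<T>* dx) {
  *dx = dout;  // stand-in for the real grouped conv_transpose gradient math
}

template <typename T>
void DepthwiseGradKernel(const std::vector<T>& dout, int channels,
                         std::vector<T>* dx) {
  // Depthwise == grouped with groups == channels; just delegate.
  GeneralGradKernel<T>(dout, /*groups=*/channels, dx);
}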
+ +#include "paddle/phi/kernels/pad_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void PadGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const std::vector& paddings, + const Scalar& pad_value, + DenseTensor* d_x) { + using XPUType = typename XPUTypeTrait::Type; + std::vector pad_left, pad_right; + std::vector out_shape = vectorize(d_out.dims()); + dev_ctx.template Alloc(d_x); + + for (size_t i = 0; i < paddings.size() / 2; ++i) { + pad_left.push_back(-paddings[i * 2]); + pad_right.push_back(-paddings[i * 2 + 1]); + } + + XPUType value = static_cast(pad_value.to()); + int r = xpu::pad(dev_ctx.x_context(), + reinterpret_cast(d_out.data()), + reinterpret_cast(d_x->data()), + out_shape, + pad_left, + pad_right, + value); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "pad"); +} +} // namespace phi + +PD_REGISTER_KERNEL(pad_grad, + XPU, + ALL_LAYOUT, + phi::PadGradKernel, + float, + int, + int16_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/pad_kernel.cc b/paddle/phi/kernels/xpu/pad_kernel.cc new file mode 100644 index 0000000000000..899503e328607 --- /dev/null +++ b/paddle/phi/kernels/xpu/pad_kernel.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
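Note how PadGradKernel above avoids a dedicated crop primitive: it negates each padding amount and calls the same xpu::pad routine, because padding a side by -k removes k elements there, which is exactly how the gradient of pad maps the output-shaped d_out back onto the input shape. The shape arithmetic as a self-contained check:

#include <cassert>

// Gradient of pad as "pad with negated amounts": for each dim,
// out_dim = in_dim + left + right, so in_dim = out_dim + (-left) + (-right).
int main() {
  int in_h = 4, pad_left = 1, pad_right = 2;
  int out_h = in_h + pad_left + pad_right;           // forward pad: 4 -> 7
  int back_h = out_h + (-pad_left) + (-pad_right);   // backward "pad": 7 -> 4
  assert(back_h == in_h);
  return 0;
}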
+ +#include "paddle/phi/kernels/pad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void PadKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& paddings, + const Scalar& pad_value, + DenseTensor* out) { + using XPUType = typename XPUTypeTrait::Type; + dev_ctx.template Alloc(out); + std::vector pad_left, pad_right; + std::vector xshape = vectorize(x.dims()); + + for (size_t i = 0; i < paddings.size() / 2; ++i) { + pad_left.push_back(paddings[i * 2]); + pad_right.push_back(paddings[i * 2 + 1]); + } + + XPUType value = static_cast(pad_value.to()); + int r = xpu::pad(dev_ctx.x_context(), + reinterpret_cast(x.data()), + reinterpret_cast(out->data()), + xshape, + pad_left, + pad_right, + value); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "pad"); +} +} // namespace phi + +PD_REGISTER_KERNEL(pad, + XPU, + ALL_LAYOUT, + phi::PadKernel, + float, + int, + int16_t, + phi::dtype::float16) {} diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index 5af0dd12f3ff9..5c0cd89306c90 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -17,16 +17,16 @@ import numpy as np import paddle -from paddle.fluid.dataloader.batch_sampler import ( +from paddle.io import BatchSampler, IterableDataset +from paddle.io.dataloader.batch_sampler import ( DistributedBatchSampler, _InfiniteIterableSampler, ) -from paddle.fluid.dataloader.dataloader_iter import ( +from paddle.io.dataloader.dataloader_iter import ( _DatasetKind, default_collate_fn, default_convert_fn, ) -from paddle.io import BatchSampler, IterableDataset class DistributedDataLoaderBase(metaclass=abc.ABCMeta): @@ -272,7 +272,7 @@ def __next__(self): return next(self.data) def _create_inner_dataloader(self): - dataloader = paddle.fluid.io.DataLoader( + dataloader = paddle.io.DataLoader( self.dataset, feed_list=self.feed_list, places=self.places, diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index da20e312a1dc8..fa4b937ba56b7 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
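Both new pad kernels decode paddings identically: a flat vector carrying one (left, right) pair per tensor dimension, split by the i * 2 / i * 2 + 1 loop shown above. A standalone check of that layout convention:

#include <cassert>
#include <vector>

// paddings = {l0, r0, l1, r1, ...}: one (left, right) pair per dimension.
int main() {
  std::vector<int> paddings = {1, 2, 0, 3};  // dim0: (1,2), dim1: (0,3)
  std::vector<int> pad_left, pad_right;
  for (size_t i = 0; i < paddings.size() / 2; ++i) {
    pad_left.push_back(paddings[i * 2]);
    pad_right.push_back(paddings[i * 2 + 1]);
  }
  assert(pad_left[1] == 0 && pad_right[1] == 3);
  return 0;
}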
-
 import os
 
 import numpy as np
@@ -27,10 +26,30 @@
 _use_cache = False
 _enable_partial_send_recv = True
+_xpu_comm_group_started = False
+
 _sync_send = os.environ.get("PADDLE_P2P_SYNC_SEND", "0")
 _sync_send = _sync_send.lower() in ['1', 'true']
+
+
+def _xpu_comm_group_start():
+    if not paddle.is_compiled_with_xpu():
+        return
+    global _xpu_comm_group_started
+    assert not _xpu_comm_group_started
+    framework.core.ProcessGroupBKCL.group_start()
+    _xpu_comm_group_started = True
+
+
+def _xpu_comm_group_end():
+    if not paddle.is_compiled_with_xpu():
+        return
+    global _xpu_comm_group_started
+    if _xpu_comm_group_started:
+        framework.core.ProcessGroupBKCL.group_end()
+        _xpu_comm_group_started = False
+
+
 def initialize_p2p_groups(hcg, use_cache=True, enable_partial_send_recv=True):
     global _hcg, _use_cache, _enable_partial_send_recv
     _hcg = hcg
@@ -357,6 +376,7 @@ def _p2p_helper(
     # TODO(Yuang Liu): use batch_isend_irecv replace all these comm ops
     tasks = []
     # start to p2p communicate
+
     if _sync_send:
         # Some devices(NPU for example) do not support asynchronized send op, So the order is
         # recv_prev -> send_next -> recv_next -> send_prev
@@ -492,8 +512,8 @@ def _p2p_helper(
                     group=_hcg.send_prev_group,
                     use_calc_stream=False,
                 )
-    else:
+
+    else:
+        _xpu_comm_group_start()
         if tensor_send_prev is not None:
             if isinstance(tensor_send_prev, tuple):
                 for d in tensor_send_prev:
@@ -529,6 +549,7 @@
                     use_calc_stream=sync_recv,
                 )
                 if sync_recv:
+                    _xpu_comm_group_end()
                     allgather_partial(
                         d,
                         nranks=mp_degree,
@@ -549,6 +570,7 @@
             )
             if sync_recv:
+                _xpu_comm_group_end()
                 allgather_partial(
                     tensor_recv_prev,
                     nranks=mp_degree,
@@ -595,6 +617,7 @@
                    )
                    if sync_recv:
+                        _xpu_comm_group_end()
                        allgather_partial(
                            d,
                            nranks=mp_degree,
@@ -615,6 +638,7 @@
                use_calc_stream=sync_recv,
            )
            if sync_recv:
+                _xpu_comm_group_end()
                allgather_partial(
                    tensor_recv_next,
                    nranks=mp_degree,
@@ -624,7 +648,7 @@
            )
        else:
            tasks.append(task)
-
+    _xpu_comm_group_end()
    if not sync_recv:
        if framework.in_dygraph_mode():
            # wait irecv tasks in eager dygraph mode with new comm library
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 928c93df0b40a..4c47cbfcc1d0b 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -323,6 +323,6 @@ def cvt_to_device(x, dev_id, blocking=True):
         place = paddle.XPUPlace(dev_id)
     else:
         raise OSError(
-            "Only supported compiled paddle with gpu/rocm, npu and xpu , but current verison is compiled with cpu."
+            "Only supports paddle compiled with gpu/rocm and xpu, but the current version is compiled with cpu."
diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
index c6acae878745b..a5689020eb009 100644
--- a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
+++ b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
@@ -110,7 +110,10 @@ def forward(
         cur_device = paddle.get_device()
         assert (
-            'gpu:' in paddle.get_device() or 'xpu:' in paddle.get_device()
+            'gpu:' in paddle.get_device()
+            or 'xpu:' in paddle.get_device()
+            or cur_device.split(':')[0]
+            in paddle.device.get_all_custom_device_type()
         ), "Recompute with RNG is not support current device: {}.".format(
             cur_device
         )
diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py
index 02099c743933e..a2c3e34f6dee5 100644
--- a/python/paddle/distributed/launch/main.py
+++ b/python/paddle/distributed/launch/main.py
@@ -52,7 +52,7 @@ def launch():
         - ``--job_id``: The job unique id, it affects the log files' name. e.g., ``--job_id=job1``. Default ``--job_id=default``.
 
-        - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device.
+        - ``--devices``: The selected accelerator devices on nodes, can be gpu/xpu etc., e.g. ``--devices=0,1,2,3`` will launch four training processes, each bound to one device.
 
         - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py``
diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py
index 3216c1b408276..6a763ce15030f 100644
--- a/python/paddle/distributed/passes/auto_parallel_fp16.py
+++ b/python/paddle/distributed/passes/auto_parallel_fp16.py
@@ -685,7 +685,7 @@ def _insert_memcopy(block, idx, src_var, dist_context, direction="D2H"):
         world_process_group.ranks,
     )
 
-    # TODO to support CUDAPinned/NPU/XPU Places
+    # TODO to support CUDAPinned/XPU Places
     if direction == "D2H":
         dst_place_type = 0
     else:
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 605db8e932bf8..e0e102e2393f9 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -55,8 +55,6 @@
 from . import reader
 from . import unique_name
 from .reader import *
-from . import dataloader
-from .dataloader import *
 from .
import core from paddle.utils import deprecated from paddle.fluid.framework import static_only diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 4724077f1c9ec..ed294700b1621 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -40,14 +40,6 @@ _cleanup, _set_SIGCHLD_handler, ) -from .dataloader import BatchSampler, Dataset, IterableDataset, Subset -from .dataloader.dataloader_iter import ( - _DataLoaderIterSingleProcess, - _DataLoaderIterMultiProcess, - _DatasetKind, - default_collate_fn, -) -from .dataloader.batch_sampler import _InfiniteIterableSampler from .layers.io import ( monkey_patch_reader_methods, _copy_reader_var_, @@ -69,22 +61,12 @@ # NOTE: [ avoid hanging & failed quickly ] These value is used in getting data from another process QUEUE_GET_TIMEOUT = 60 -__all__ = ['PyReader', 'DataLoader', 'default_collate_fn'] +__all__ = ['PyReader', 'DataLoader'] data_loader_unique_name_generator = UniqueNameGenerator() KEEP_DATA_LOADER_ORDER = True USE_PINNED_MEMORY = None -# AutoTune Flags -USE_AUTOTUNE = False -TUNING_STEPS = 500 - - -def set_autotune_config(use_autotune, tuning_steps=500): - global USE_AUTOTUNE - USE_AUTOTUNE = use_autotune - global TUNING_STEPS - TUNING_STEPS = tuning_steps def keep_data_loader_order(*args): @@ -171,454 +153,7 @@ def _check_input_array(cls, item): return arr -class AuToTune: - def __init__(self, loader): - self.loader = loader - self.max_num_worker = multiprocessing.cpu_count() / 2 - - def __call__(self): - # use default loader - if (not USE_AUTOTUNE) or (not self.need_autotune()): - return self.loader.num_workers - - # get autotune loader - auto_tune_loader = self.get_autotune_loader() - if auto_tune_loader is None: - return self.loader.num_workers - - # pick the best num_workers - auto_tune_start = time.time() - logging.debug("========= DataLoader Auto Tune =========") - logging.debug( - "User config for DataLoader: " + str(self.loader.num_workers) - ) - best_num_workers = 0 - min_cost = float("inf") - logging.debug( - "Tuning Range for num_workers: 0 ~ " + str(self.max_num_worker) - ) - num_workers = 0 - while num_workers < self.max_num_worker: - auto_tune_loader.num_workers = num_workers - avg_cost = self.evaluate_reader_cost(auto_tune_loader) - if min_cost * 0.75 > avg_cost: - min_cost = avg_cost - best_num_workers = num_workers - else: - update_num = self.is_best( - auto_tune_loader, - best_num_workers, - min_cost, - self.max_num_worker, - ) - if update_num == best_num_workers: - break - else: - best_num_workers = update_num - logging.debug( - "num_workers: " - + str(num_workers) - + " avg_cost: " - + str(avg_cost) - ) - num_workers += 2 - logging.info( - "auto_tune dataLoader best_num_workers: " + str(best_num_workers) - ) - logging.debug( - "AutoTuning Cost for DataLoader: " - + str(time.time() - auto_tune_start) - + ' seconds' - ) - - # tune the default loader's num_workers - return best_num_workers - - def need_autotune(self): - if sys.platform == 'darwin' or sys.platform == 'win32': - return False - else: - return True - - def get_sub_dataset(self, dataset, batch_size): - num_samples = min(batch_size * TUNING_STEPS, len(dataset)) - sub_dataset = Subset(dataset, indices=list(range(num_samples))) - return sub_dataset - - def get_autotune_loader(self): - loader = copy.copy(self.loader) - batch_size = self.loader.batch_sampler.batch_size - if isinstance( - self.loader.batch_sampler, paddle.io.DistributedBatchSampler - ): - dataset = self.loader.batch_sampler.dataset - sub_dataset = 
self.get_sub_dataset(dataset, batch_size) - loader.batch_sampler = paddle.io.DistributedBatchSampler( - dataset=sub_dataset, - batch_size=batch_size, - num_replicas=self.loader.batch_sampler.nranks, - rank=self.loader.batch_sampler.local_rank, - shuffle=self.loader.batch_sampler.shuffle, - drop_last=self.loader.batch_sampler.drop_last, - ) - elif isinstance(self.loader.batch_sampler, paddle.io.BatchSampler): - dataset = self.loader.batch_sampler.sampler.data_source - sub_dataset = self.get_sub_dataset(dataset, batch_size) - loader.batch_sampler = paddle.io.BatchSampler( - dataset=sub_dataset, - batch_size=batch_size, - drop_last=self.loader.batch_sampler.drop_last, - ) - else: - loader = None - return loader - - def evaluate_reader_cost(self, reader): - costs = [] - avg_cost = 0 - start = time.time() - for i, data in enumerate(reader): - costs.append(time.time() - start) - start = time.time() - if len(costs) > 2: - avg_cost = sum(costs[2:]) / len(costs[2:]) - else: - avg_cost = sum(costs[0:]) / len(costs[0:]) - return avg_cost - - def is_best(self, reader, best_workers, best_time, num_work_boundary): - step = 0 - num_workers = best_workers + 1 - boundary = 1 - while num_workers < num_work_boundary and step < 5: - self.loader.num_workers = num_workers - time = self.evaluate_reader_cost(reader) - logging.debug( - "for back num_workers: " - + str(num_workers) - + " avg_cost: " - + str(time) - ) - step += 1 - if time < best_time * 0.70 * boundary: - return num_workers - else: - num_workers += 1 - boundary *= 0.80 - return best_workers - - class DataLoader: - """ - DataLoader prodives an iterator which iterates given dataset - once by the batch_sampler. - - DataLoader supports single-process and multi-prcess data loading, - multi-process workers will be used to load data asynchronously if - :attr:`num_workers` is set as a positive number. - - DataLoader supports map-style dataset and iterable-style dataset. - - For map-style datast(can get a sample from dataset with a given - index), please see :code:`paddle.io.Dataset`. - - For iterable-style datast(get samples from dataset iteratively, - like a Python iterator), please see :code:`paddle.io.IterableDataset`. - - For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` - - .. note:: - GPU tensor operation is not supported in subprocess currently, - please don't use GPU tensor operations in pipeline which will - be performed in subprocess, such as dataset transforms, collte_fn, - etc. Numpy array and CPU tensor operation is supported. - - **Disable automatic batching** - - In certain cases such as some NLP tasks, instead of automatic batching, - handling batching manually in dataset is needed by users. For these - cases, automatic batching is disabled if both :attr:`batch_size` and - :attr:`batch_sampler` is set as None, each data got from :attr:`dataset` - should be batched data and will be processed with function define by - :attr:`collate_fn` or :attr:`default_collate_fn`. - - - .. note:: - When automatic batching is disabled, :attr:`default_collate_fn` will - do nothing to data from dataset. - - - Args: - dataset(Dataset): the dataset to load data from, should be an - instance of subclass of :code:`paddle.io.Dataset` or - :code:`paddle.io.IterableDataset`. - feed_list (list(Tensor)|tuple(Tensor), optional): feed Tensor list. - The Tensors should be created by :code:`paddle.static.data()`. - :attr:`feed_list` must be set if :attr:`return_list` is - False. Default None. 
- places(list(Place)|tuple(Place)|list(str), optional): a list of Place, - to put data onto, :attr:`places` can be None, if - :attr:`places` is None, default place(CPUPlace or CUDAPlace(0)) - will be used. Default None. If ``places`` is list of string, - the string in the list can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, - where ``x`` is the index of the GPUs. - return_list (bool, optional): whether the return value on each device is - presented as a list. If :attr:`return_list=False`, the return - value on each device would be a dict of str -> Tensor, where - the key of the dict is the name of each fed Tensors. If - :attr:`return_list=True`, the return value on each device would - be a list(Tensor). :attr:`return_list` can only be True - in dynamic graph mode. Default True. - batch_sampler(BatchSampler, optional): an instance of `paddle.io.BatchSampler` - to generate batch indices to draw samples from :attr:`dataset` - and combine a batch. Default None. - batch_size(int|None, optional): sample number in a mini-batch, a substitution - parameter for :attr:`batch_sampler`, if :attr:`batch_sampler` - is not set, a default `paddle.io.BatchSampler` will be used - and initialize by :attr:`batch_size`, :attr:`shuffle` and - :attr:`drop_last`. Default 1. - shuffle(bool, optional): whther to shuffle indices order before genrate - batch indices, a substitution parameter for :attr:`batch_sampler` - see :attr:`batch_size`. Default False. - drop_last(bool, optional): whether drop the last incomplete batch dataset size - is not divisible by the batch size, a substitution parameter - for :attr:`batch_sampler`, see :attr:`batch_size`. Default False - collate_fn(callable, optional): function to generate mini-batch data by merging - the sample list, None for only stack each fields of sample in axis - 0(same as :attr::`np.stack(..., axis=0)`). Default None - num_workers(int, optional): the number of subprocess to load data, 0 for no - subprocess used and loading data in main process. Default 0 - use_buffer_reader (bool, optional): whether to use bufferred reader. - If use_buffer_reader=True, the DataLoader would prefetch - batch data asynchronously, so it would speed up data feeding - and occupies a little more CPU or GPU memory, i.e., the memory - of one batch input data. Default True. - prefetch_factor (int, optional): Number of batch data the DataLoader would prefetch - if use_buffer_reader=True. Default 2. - use_shared_memory (bool, optional): whether to use shared memory to speed up - putting data into inter-process queue, set :attr:`use_shared_memory` - as True only when the shared memory space on your machine(e.g. - space of '/dev/shm' on Linux operating sysytem) is large enough. - Shared memory will only be enabled in multi-process mode(num_workers - > 0). Default True. - timeout(int, optional): the timeout value for getting data form output queue - of subprocesses. Default 0. - worker_init_fn(callable, optional): init function which will be called with - worker id on each subproces starting if not set as None. Default - None. - - Returns: - DataLoader: an iterable object for data iterating, each elemnet of the generated data is a Tensor. - - Examples: - - .. 
code-block:: python - - import numpy as np - - import paddle - import paddle.nn as nn - import paddle.nn.functional as F - from paddle.io import Dataset, BatchSampler, DataLoader - - BATCH_NUM = 20 - BATCH_SIZE = 16 - EPOCH_NUM = 4 - - IMAGE_SIZE = 784 - CLASS_NUM = 10 - - # define a random dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([IMAGE_SIZE]).astype('float32') - label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - - class SimpleNet(nn.Layer): - def __init__(self): - super().__init__() - self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) - - def forward(self, image, label=None): - return self.fc(image) - - simple_net = SimpleNet() - opt = paddle.optimizer.SGD(learning_rate=1e-3, - parameters=simple_net.parameters()) - - loader = DataLoader(dataset, - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) - - for e in range(EPOCH_NUM): - for i, (image, label) in enumerate(loader()): - out = simple_net(image) - loss = F.cross_entropy(out, label) - avg_loss = paddle.mean(loss) - avg_loss.backward() - opt.minimize(avg_loss) - simple_net.clear_gradients() - print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) - - - .. note:: - For reading iterable dataset with multiprocess Dataloader, - please see :code:`paddle.io.IterableDataset` - - """ - - def __init__( - self, - dataset, - feed_list=None, - places=None, - return_list=True, - batch_sampler=None, - batch_size=1, - shuffle=False, - drop_last=False, - collate_fn=None, - num_workers=0, - use_buffer_reader=True, - prefetch_factor=2, - use_shared_memory=True, - timeout=0, - worker_init_fn=None, - persistent_workers=False, - ): - self.return_list = return_list - self.collate_fn = collate_fn - self.use_buffer_reader = use_buffer_reader - self.prefetch_factor = prefetch_factor - self.worker_init_fn = worker_init_fn - - self.dataset = dataset - - if not return_list and not _non_static_mode(): - assert ( - feed_list is not None - ), "feed_list should be set when return_list=False" - self.feed_list = feed_list - - if places is None: - places = _current_expected_place() - if isinstance(places, (list, tuple)): - places = _get_paddle_place_list(places) - else: - places = _get_paddle_place(places) - self.places = _convert_places(places) - - assert num_workers >= 0, "num_workers should be a non-negative value" - if num_workers > 0 and ( - sys.platform == 'darwin' or sys.platform == 'win32' - ): - warnings.warn( - "DataLoader with multi-process mode is not supported on MacOs and Windows currently." 
- " Please use signle-process mode with num_workers = 0 instead" - ) - num_workers = 0 - self.num_workers = num_workers - - assert prefetch_factor > 0, "prefetch_factor should be a positive value" - - self.use_shared_memory = use_shared_memory - if use_shared_memory and num_workers == 0: - self.use_shared_memory = False - - assert timeout >= 0, "timeout should be a non-negative value" - self.timeout = timeout - - if isinstance(dataset, IterableDataset): - self.dataset_kind = _DatasetKind.ITER - if shuffle: - raise ValueError( - "IterableDataset not support shuffle, but got shuffle={}".format( - shuffle - ) - ) - if batch_sampler is not None: - raise ValueError( - "IterableDataset expect unspecified batch_sampler" - ) - else: - self.dataset_kind = _DatasetKind.MAP - - if batch_sampler is not None: - assert batch_size == 1 and not shuffle and not drop_last, ( - "batch_size/shuffle/drop_last should not be set when " - "batch_sampler is given" - ) - self.batch_sampler = batch_sampler - self.batch_size = None - elif batch_size is None: - self.batch_sampler = None - self.batch_size = None - else: - assert batch_size > 0, ( - "batch_size should be None or a positive value when " - "batch_sampler is not given" - ) - self.batch_size = batch_size - if isinstance(dataset, IterableDataset): - self.batch_sampler = _InfiniteIterableSampler( - dataset, batch_size - ) - else: - self.batch_sampler = BatchSampler( - dataset=dataset, - batch_size=batch_size, - shuffle=shuffle, - drop_last=drop_last, - ) - - self.drop_last = drop_last - self.auto_collate_batch = self.batch_sampler is not None - - self.pin_memory = False - if _non_static_mode(): - self.pin_memory = ( - True if use_pinned_memory() is None else use_pinned_memory() - ) - - self._persistent_workers = persistent_workers - self._iterator = None - self.num_workers = AuToTune(self).__call__() - - def __len__(self): - if self.dataset_kind == _DatasetKind.ITER: - raise ValueError("length of IterableDataset not supported") - else: - if self.auto_collate_batch: - return len(self.batch_sampler) - else: - return len(self.dataset) - - def __iter__(self): - if self.num_workers == 0: - return _DataLoaderIterSingleProcess(self) - elif self._persistent_workers: - if self._iterator is None: - self._iterator = _DataLoaderIterMultiProcess(self) - else: - self._iterator._reset() - return self._iterator - else: - return _DataLoaderIterMultiProcess(self) - - def __call__(self): - return self.__iter__() - @staticmethod def from_generator( feed_list=None, @@ -793,7 +328,7 @@ def set_data_source(loader, places): label = static.data(name='label', shape=[None, 1], dtype='int64') # Define DataLoader - loader = paddle.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE) + loader = paddle.fluid.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE) # Define network loss = simple_net(image, label) @@ -867,7 +402,7 @@ def forward(self, x): adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters()) # create data loader - loader = paddle.io.DataLoader.from_generator(capacity=5) + loader = paddle.fluid.io.DataLoader.from_generator(capacity=5) loader.set_batch_generator(random_batch_reader()) for epoch_id in range(EPOCH_NUM): @@ -944,7 +479,7 @@ def from_dataset(dataset, places, drop_last=True): use_var=[image, label]) dataset.set_filelist(['a.txt', 'b.txt', 'c.txt']) - loader = paddle.io.DataLoader.from_dataset(dataset, static.cpu_places()) + loader = paddle.fluid.io.DataLoader.from_dataset(dataset, 
static.cpu_places()) """ return DatasetLoader(dataset, places, drop_last) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py index c74e2b7adaa22..a2a9c9113271b 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py @@ -37,8 +37,8 @@ ) from paddle.distributed.sharding.group_sharded import group_sharded_parallel from paddle.distributed.utils.log_utils import get_logger -from paddle.fluid.dataloader.dataset import IterableDataset from paddle.incubate.distributed.utils.io import save_for_auto_inference +from paddle.io import IterableDataset from paddle.nn import Linear logger = get_logger("INFO", __file__) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py index d7e09481a1c71..9e2b89b12860c 100755 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py @@ -406,7 +406,7 @@ def setUp(self): ] def test_main(self): - from paddle.fluid.dataloader.worker import _generate_states + from paddle.io.dataloader.worker import _generate_states for inp, outp in zip(self.inputs, self.outputs): out = _generate_states(*inp) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py index 2b4d1a78d1ea3..bfd08f703c4f6 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py @@ -19,8 +19,8 @@ from paddle import fluid from paddle.fluid import core -from paddle.fluid.dataloader.dataloader_iter import _worker_loop from paddle.io import BatchSampler, DataLoader, Dataset, IterableDataset +from paddle.io.dataloader.worker import _worker_loop class RandomDataset(Dataset): diff --git a/python/paddle/incubate/autotune.py b/python/paddle/incubate/autotune.py index 742c5bded064a..dfad1dc58c928 100644 --- a/python/paddle/incubate/autotune.py +++ b/python/paddle/incubate/autotune.py @@ -84,7 +84,7 @@ def set_config(config=None): if config is None: core.enable_autotune() core.enable_layout_autotune() - paddle.fluid.reader.set_autotune_config(use_autotune=True) + paddle.io.reader.set_autotune_config(use_autotune=True) return config_dict = {} @@ -147,7 +147,7 @@ def set_config(config=None): ) if "tuning_steps" in dataloader_config: if isinstance(dataloader_config['tuning_steps'], int): - paddle.fluid.reader.set_autotune_config( + paddle.io.reader.set_autotune_config( use_autoune, dataloader_config['tuning_steps'] ) else: @@ -155,4 +155,4 @@ def set_config(config=None): "The auto-tuning configuration of the dataloader is incorrect." "The `tuning_steps` should be int. Use default parameter instead." 
) - paddle.fluid.reader.set_autotune_config(use_autoune) + paddle.io.reader.set_autotune_config(use_autoune) diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index a9c0e9a2f2d2f..6c2e0dae67834 100755 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -14,21 +14,21 @@ # TODO: define all functions about input & output in this directory -from ..fluid.io import DataLoader # noqa: F401 -from ..fluid.dataloader import Dataset # noqa: F401 -from ..fluid.dataloader import IterableDataset # noqa: F401 -from ..fluid.dataloader import BatchSampler # noqa: F401 -from ..fluid.dataloader import get_worker_info # noqa: F401 -from ..fluid.dataloader import TensorDataset # noqa: F401 -from ..fluid.dataloader import Sampler # noqa: F401 -from ..fluid.dataloader import SequenceSampler # noqa: F401 -from ..fluid.dataloader import RandomSampler # noqa: F401 -from ..fluid.dataloader import DistributedBatchSampler # noqa: F401 -from ..fluid.dataloader import ComposeDataset # noqa: F401 -from ..fluid.dataloader import ChainDataset # noqa: F401 -from ..fluid.dataloader import WeightedRandomSampler # noqa: F401 -from ..fluid.dataloader import Subset # noqa: F401 -from ..fluid.dataloader import random_split # noqa: F401 +from .reader import DataLoader # noqa: F401 +from .dataloader import Dataset # noqa: F401 +from .dataloader import IterableDataset # noqa: F401 +from .dataloader import BatchSampler # noqa: F401 +from .dataloader import get_worker_info # noqa: F401 +from .dataloader import TensorDataset # noqa: F401 +from .dataloader import Sampler # noqa: F401 +from .dataloader import SequenceSampler # noqa: F401 +from .dataloader import RandomSampler # noqa: F401 +from .dataloader import DistributedBatchSampler # noqa: F401 +from .dataloader import ComposeDataset # noqa: F401 +from .dataloader import ChainDataset # noqa: F401 +from .dataloader import WeightedRandomSampler # noqa: F401 +from .dataloader import Subset # noqa: F401 +from .dataloader import random_split # noqa: F401 __all__ = [ # noqa 'Dataset', diff --git a/python/paddle/fluid/dataloader/__init__.py b/python/paddle/io/dataloader/__init__.py similarity index 55% rename from python/paddle/fluid/dataloader/__init__.py rename to python/paddle/io/dataloader/__init__.py index c0b2052283b1c..bb65463f70afc 100644 --- a/python/paddle/fluid/dataloader/__init__.py +++ b/python/paddle/io/dataloader/__init__.py @@ -12,21 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import dataset -from .dataset import * +from .dataset import Dataset +from .dataset import IterableDataset +from .dataset import TensorDataset +from .dataset import ComposeDataset +from .dataset import ChainDataset +from .dataset import random_split +from .dataset import Subset -from . import batch_sampler -from .batch_sampler import * +from .batch_sampler import BatchSampler +from .batch_sampler import DistributedBatchSampler -from . import dataloader_iter -from .dataloader_iter import * +from .worker import get_worker_info -from . 
import sampler -from .sampler import * - -__all__ = ( - dataset.__all__ - + batch_sampler.__all__ - + dataloader_iter.__all__ - + sampler.__all__ -) +from .sampler import Sampler +from .sampler import SequenceSampler +from .sampler import RandomSampler +from .sampler import WeightedRandomSampler diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/io/dataloader/batch_sampler.py similarity index 98% rename from python/paddle/fluid/dataloader/batch_sampler.py rename to python/paddle/io/dataloader/batch_sampler.py index 3e0449719c4cd..190e9240900f8 100644 --- a/python/paddle/fluid/dataloader/batch_sampler.py +++ b/python/paddle/io/dataloader/batch_sampler.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import math -from .sampler import Sampler, SequenceSampler, RandomSampler -from .dataset import Dataset, IterableDataset +import numpy as np -__all__ = ["BatchSampler", "DistributedBatchSampler"] +from .dataset import IterableDataset +from .sampler import RandomSampler, Sampler, SequenceSampler class BatchSampler(Sampler): diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/io/dataloader/collate.py similarity index 97% rename from python/paddle/fluid/dataloader/collate.py rename to python/paddle/io/dataloader/collate.py index dd70a3421409d..141624668f09b 100644 --- a/python/paddle/fluid/dataloader/collate.py +++ b/python/paddle/io/dataloader/collate.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle import numbers +from collections.abc import Mapping, Sequence + import numpy as np -from ..framework import _non_static_mode -from .. import core, layers -from collections.abc import Sequence, Mapping +import paddle + +from ...framework import core def default_collate_fn(batch): diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/io/dataloader/dataloader_iter.py similarity index 98% rename from python/paddle/fluid/dataloader/dataloader_iter.py rename to python/paddle/io/dataloader/dataloader_iter.py index 2b06c371ef36f..43b749c869dd6 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/io/dataloader/dataloader_iter.py @@ -12,51 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. +import itertools +import logging import os +import queue import sys -import time -import signal -import numbers -import logging -import itertools import threading +import time import warnings -import numpy as np -from collections import namedtuple -from paddle.fluid.framework import ( - _set_expected_place, - _current_expected_place, - set_flags, -) -import queue +import numpy as np import paddle -import paddle.profiler as profiler +from paddle import profiler +from paddle.fluid.framework import _current_expected_place, _set_expected_place +from paddle.profiler.timer import benchmark from paddle.profiler.utils import in_profiler_mode -from .. 
import core, layers -from ..framework import in_dygraph_mode + +from ...framework import core, in_dygraph_mode from ..multiprocess_utils import ( - _set_SIGCHLD_handler, MP_STATUS_CHECK_INTERVAL, CleanupFuncRegistrar, + _set_SIGCHLD_handler, ) -from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher from .batch_sampler import _InfiniteIterableSampler from .collate import default_collate_fn, default_convert_fn +from .flat import _flatten_batch, _restore_batch from .worker import ( - ParentWatchDog, - get_worker_info, - _worker_loop, _DatasetKind, _IterableDatasetStopIteration, - _WorkerException, _ResumeIteration, + _worker_loop, + _WorkerException, ) -from .flat import _flatten_batch, _restore_batch -from paddle.profiler.timer import benchmark - -__all__ = ['get_worker_info'] # NOTE: fix `terminate called without an active exception` # if for loop break and program exit immediately(with no model @@ -95,7 +83,7 @@ class _DataLoaderIterBase: data by setting in given dataloader. Args: - loader(instance of DataLoader): instance of `fluid.io.DataLoader` + loader(instance of DataLoader): instance of `paddle.io.DataLoader` """ def __init__(self, loader): @@ -439,7 +427,7 @@ def __init__(self, loader): self._shutdown = False def _init_workers(self): - import paddle.incubate.multiprocessing as multiprocessing + from paddle.incubate import multiprocessing # multiprocess worker and indice queue list initial as empty self._workers = [] diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/io/dataloader/dataset.py similarity index 98% rename from python/paddle/fluid/dataloader/dataset.py rename to python/paddle/io/dataloader/dataset.py index 3701da0b33ec7..e8bb6bbd364c8 100755 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/io/dataloader/dataset.py @@ -13,17 +13,8 @@ # limitations under the License. import paddle -from .. import framework - -__all__ = [ - "Dataset", - "IterableDataset", - "TensorDataset", - "ComposeDataset", - "ChainDataset", - "random_split", - "Subset", -] + +from ... import framework class Dataset: diff --git a/python/paddle/fluid/dataloader/fetcher.py b/python/paddle/io/dataloader/fetcher.py similarity index 60% rename from python/paddle/fluid/dataloader/fetcher.py rename to python/paddle/io/dataloader/fetcher.py index b097a315c0c73..309d009cfc106 100644 --- a/python/paddle/fluid/dataloader/fetcher.py +++ b/python/paddle/io/dataloader/fetcher.py @@ -12,12 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging -from ..log_helper import get_logger -from collections.abc import Sequence, Mapping - -_WARNING_TO_LOG = True - class _DatasetFetcher: def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): @@ -37,47 +31,8 @@ def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): # ecah sample processing in the batch def fetch(self, batch_indices, done_event=None): raise NotImplementedError( - "'fetch' not implement for class {}".format(self.__class__.__name__) - ) - - def _log_warning(self): - # only log warning on GPU 0 when distributed launch - from ...distributed import get_world_size, get_rank - - if get_world_size() >= 2 and get_rank() != 0: - return - - warn_str = ( - "Detect dataset only contains single fileds, return format " - "changed since Paddle 2.1. In Paddle <= 2.0, DataLoader add " - "a list surround output data(e.g. return [data]), and in " - "Paddle >= 2.1, DataLoader return the single filed directly " - "(e.g. return data). 
For example, in following code: \n\n" - ) - warn_str += ( - "import numpy as np\n" - "from paddle.io import DataLoader, Dataset\n\n" - "class RandomDataset(Dataset):\n" - " def __getitem__(self, idx):\n" - " data = np.random.random((2, 3)).astype('float32')\n\n" - " return data\n\n" - " def __len__(self):\n" - " return 10\n\n" - "dataset = RandomDataset()\n" - "loader = DataLoader(dataset, batch_size=1)\n" - "data = next(loader())\n\n" - ) - - warn_str += ( - "In Paddle <= 2.0, data is in format '[Tensor(shape=(1, 2, 3), " - "dtype=float32)]', and in Paddle >= 2.1, data is in format" - " 'Tensor(shape=(1, 2, 3), dtype=float32)'\n" - ) - - logger = get_logger( - "DataLoader", logging.INFO, fmt='%(levelname)s: %(message)s' + f"'fetch' not implement for class {self.__class__.__name__}" ) - logger.warning(warn_str) class _IterableDatasetFetcher(_DatasetFetcher): @@ -103,10 +58,6 @@ def fetch(self, batch_indices, done_event=None): ): raise StopIteration - global _WARNING_TO_LOG - if not isinstance(data[0], (Sequence, Mapping)) and _WARNING_TO_LOG: - self._log_warning() - _WARNING_TO_LOG = False else: data = next(self.dataset_iter) @@ -128,10 +79,6 @@ def fetch(self, batch_indices, done_event=None): else: return None - global _WARNING_TO_LOG - if not isinstance(data[0], (Sequence, Mapping)) and _WARNING_TO_LOG: - self._log_warning() - _WARNING_TO_LOG = False else: data = self.dataset[batch_indices] diff --git a/python/paddle/fluid/dataloader/flat.py b/python/paddle/io/dataloader/flat.py similarity index 93% rename from python/paddle/fluid/dataloader/flat.py rename to python/paddle/io/dataloader/flat.py index 1e1ed1eebd806..f674d7fb2b4b9 100644 --- a/python/paddle/fluid/dataloader/flat.py +++ b/python/paddle/io/dataloader/flat.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle import numbers -import numpy as np +from collections.abc import Mapping, Sequence -from collections.abc import Sequence, Mapping +import numpy as np +import paddle FIELD_PREFIX = "_paddle_field_" @@ -38,7 +38,7 @@ def _flatten(batch, flat_batch, structure, field_idx): field, (np.ndarray, paddle.Tensor, paddle.fluid.core.eager.Tensor), ): - structure.append('{}{}'.format(FIELD_PREFIX, field_idx)) + structure.append(f'{FIELD_PREFIX}{field_idx}') flat_batch.append(field) field_idx += 1 elif isinstance(field, (str, bytes, numbers.Number)): @@ -61,7 +61,7 @@ def _flatten(batch, flat_batch, structure, field_idx): field, (np.ndarray, paddle.Tensor, paddle.fluid.core.eager.Tensor), ): - structure[k] = '{}{}'.format(FIELD_PREFIX, field_idx) + structure[k] = f'{FIELD_PREFIX}{field_idx}' flat_batch.append(field) field_idx += 1 elif isinstance(field, (str, bytes, numbers.Number)): @@ -79,7 +79,7 @@ def _flatten(batch, flat_batch, structure, field_idx): else: structure[k] = field else: - raise TypeError("wrong flat data type: {}".format(type(batch))) + raise TypeError(f"wrong flat data type: {type(batch)}") return structure, field_idx @@ -130,7 +130,7 @@ def _restore(structure, field_idx): elif isinstance(field, (Sequence, Mapping)): field_idx = _restore(structure[k], field_idx) else: - raise TypeError("wrong flat data type: {}".format(type(structure))) + raise TypeError(f"wrong flat data type: {type(structure)}") return field_idx @@ -145,7 +145,7 @@ def _restore(structure, field_idx): if isinstance(structure, (str, bytes)): assert structure == '{}{}'.format( FIELD_PREFIX, 0 - ), "invalid structure: {}".format(structure) + ), f"invalid structure: {structure}" return flat_batch[0] field_idx = _restore(structure, 0) assert field_idx + 1 == len(flat_batch), "Tensor parse incomplete" diff --git a/python/paddle/fluid/dataloader/sampler.py b/python/paddle/io/dataloader/sampler.py similarity index 98% rename from python/paddle/fluid/dataloader/sampler.py rename to python/paddle/io/dataloader/sampler.py index a6ec3ffbae9b8..aa8a4e649c76c 100644 --- a/python/paddle/fluid/dataloader/sampler.py +++ b/python/paddle/io/dataloader/sampler.py @@ -13,14 +13,8 @@ # limitations under the License. import numpy as np -from .. import core -__all__ = [ - "Sampler", - "SequenceSampler", - "RandomSampler", - "WeightedRandomSampler", -] +from ...framework import core class Sampler: @@ -317,7 +311,7 @@ def __iter__(self): idxs = _weighted_sample( self.weights, self.num_samples, self.replacement ) - return iter(idxs.reshape((-1)).tolist()) + return iter(idxs.reshape(-1).tolist()) def __len__(self): mul = np.prod(self.weights.shape) // self.weights.shape[-1] diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/io/dataloader/worker.py similarity index 98% rename from python/paddle/fluid/dataloader/worker.py rename to python/paddle/io/dataloader/worker.py index de6c382054e0a..4ca80e09ae65e 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/io/dataloader/worker.py @@ -13,25 +13,25 @@ # limitations under the License. import os + +# NOTE: queue has a different name in python2 and python3 +import queue import sys -import paddle -import numpy as np import traceback -from collections import namedtuple -from .. 
import core
-from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher
+
+import numpy as np
+
+import paddle
+
+from ...framework import core
 from ..multiprocess_utils import (
-    _cleanup_mmap,
-    CleanupFuncRegistrar,
     MP_STATUS_CHECK_INTERVAL,
+    CleanupFuncRegistrar,
+    _cleanup_mmap,
 )
-from ..framework import _non_static_mode, _in_eager_without_dygraph_check
+from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher
 from .flat import _flatten_batch
 
-import queue
-
-__all__ = ['get_worker_info']
-
 
 class _IterableDatasetStopIteration:
     def __init__(self, worker_id):
@@ -59,7 +59,7 @@ def create_fetcher(
             dataset, auto_collate_batch, collate_fn, drop_last
         )
     else:
-        raise NotImplementedError("unknown Dataset kind {}".format(kind))
+        raise NotImplementedError(f"unknown Dataset kind {kind}")
 
 
 class ParentWatchDog:
@@ -291,9 +291,9 @@ def _worker_loop(
 
     # set different numpy seed for each worker
     try:
-        import numpy as np
-        import time
         import random
+
+        import numpy as np
    except ImportError:
        pass
    else:
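The final worker.py hunk above keeps the per-worker numpy re-seeding inside _worker_loop. Users who need reproducible per-worker randomness on top of that can pass worker_init_fn, which the moved DataLoader forwards to every subprocess; a small sketch (the seed formula is illustrative, and multi-process mode assumes Linux):

    import numpy as np
    from paddle.io import DataLoader, Dataset

    class RandomDataset(Dataset):
        def __getitem__(self, idx):
            return np.random.random([4]).astype("float32")

        def __len__(self):
            return 32

    def seed_worker(worker_id):
        # Illustrative only: any formula giving distinct per-worker seeds works.
        np.random.seed(1234 + worker_id)

    loader = DataLoader(
        RandomDataset(), batch_size=8, num_workers=2, worker_init_fn=seed_worker
    )
    for batch in loader():
        pass
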
diff --git a/python/paddle/io/multiprocess_utils.py b/python/paddle/io/multiprocess_utils.py
new file mode 100644
index 0000000000000..5792983ceb475
--- /dev/null
+++ b/python/paddle/io/multiprocess_utils.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import atexit
+
+# NOTE: queue has a different name in python2 and python3
+import queue
+import signal
+import sys
+
+from ..framework import core
+
+# multi-process worker check indices queue interval, avoid
+# hanging in subprocess data loading
+MP_STATUS_CHECK_INTERVAL = 5.0
+
+# NOTE: [ mmap files clear ] If there is still data in the multiprocess queue when the main process finishes reading,
+# the data in the queue needs to be popped. Then the LoDTensor read by the main process
+# from the child process will automatically clear the memory-mapped file.
+multiprocess_queue_set = set()
+
+
+def _clear_multiprocess_queue_set():
+    global multiprocess_queue_set
+    for data_queue in multiprocess_queue_set:
+        while True:
+            try:
+                data_queue.get_nowait()
+            except queue.Empty:
+                break
+
+
+# NOTE: main process clear function at exit
+def _cleanup():
+    # NOTE: inter-process Queue shared memory objects clear function
+    _clear_multiprocess_queue_set()
+    # NOTE: main process memory map files clear function
+    core._cleanup_mmap_fds()
+
+
+# NOTE: for child process clear function at exit
+def _cleanup_mmap():
+    # clear memory map files in child process
+    core._cleanup_mmap_fds()
+
+
+# NOTE: used to register a function to be executed at interpreter exit.
+class CleanupFuncRegistrar:
+    # Record the cleanup functions that have been executed
+    _executed_func_set = set()
+    # Record the cleanup functions that have been registered
+    _registered_func_set = set()
+
+    @classmethod
+    def register(cls, function, signals=[]):
+        def _func_executor():
+            if function not in cls._executed_func_set:
+                try:
+                    function()
+                finally:
+                    cls._executed_func_set.add(function)
+
+        def _func_register(function):
+            if not callable(function):
+                raise TypeError("%s is not a callable object." % (function))
+            # check whether the function object is hashable
+            if function not in cls._registered_func_set:
+                atexit.register(_func_executor)
+                cls._registered_func_set.add(function)
+
+        def _signal_handler(signum=None, frame=None):
+            _func_executor()
+            if signum is not None:
+                if signum == signal.SIGINT:
+                    raise KeyboardInterrupt
+                sys.exit(signum)
+
+        def _signal_register(signals):
+            signals = set(signals)
+            for sig in signals:
+                orig_handler = signal.signal(sig, _signal_handler)
+                if orig_handler not in (signal.SIG_DFL, signal.SIG_IGN):
+                    if (
+                        sig == signal.SIGINT
+                        and orig_handler is signal.default_int_handler
+                    ):
+                        continue
+                    if orig_handler not in cls._registered_func_set:
+                        atexit.register(orig_handler)
+                        cls._registered_func_set.add(orig_handler)
+
+        # deal with signals
+        _signal_register(signals)
+        # deal with function
+        _func_register(function)
+
+
+# NOTE: [ mmap files clear ] When the main process exits unexpectedly, the remaining
+# shared memory objects in the inter-process Queue and the main process (mostly in the
+# BlockingQueue) may not be completely released, resulting in the corresponding
+# memory-mapped file remaining on the disk (/dev/shm), so register this function
+# to clean up shared memory objects in these two queues before the python interpreter exits.
+# NOTE: Currently multi-process DataLoader only supports Linux platform
+if not (sys.platform == 'darwin' or sys.platform == 'win32'):
+    CleanupFuncRegistrar.register(_cleanup)
+
+# ------------ SIGCHLD handler setting --------------
+_SIGCHLD_handler_set = False
+
+
+def _set_SIGCHLD_handler():
+    global _SIGCHLD_handler_set
+    if _SIGCHLD_handler_set:
+        return
+
+    current_handler = signal.getsignal(signal.SIGCHLD)
+    if not callable(current_handler):
+        current_handler = None
+
+    def __handler__(signum, frame):
+        # NOTE: Here the signum is SIGCHLD, when the child process exits,
+        # this handler will be called whenever the child process exits
+        # normally or abnormally.
+        core._throw_error_if_process_failed()
+        if current_handler is not None:
+            current_handler(signum, frame)
+
+    signal.signal(signal.SIGCHLD, __handler__)
+    _SIGCHLD_handler_set = True
diff --git a/python/paddle/io/reader.py b/python/paddle/io/reader.py
new file mode 100644
index 0000000000000..6698caa435f94
--- /dev/null
+++ b/python/paddle/io/reader.py
@@ -0,0 +1,528 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import copy +import multiprocessing + +# NOTE: queue has a different name in python2 and python3 +import sys +import time +import warnings + +import paddle +from paddle.fluid.framework import logging + +from ..fluid.framework import ( + _current_expected_place, + _get_paddle_place, + _get_paddle_place_list, + _non_static_mode, +) +from ..framework import core +from .dataloader import BatchSampler, IterableDataset, Subset +from .dataloader.batch_sampler import _InfiniteIterableSampler +from .dataloader.dataloader_iter import ( + _DataLoaderIterMultiProcess, + _DataLoaderIterSingleProcess, + _DatasetKind, +) + +# NOTE: [ avoid hanging & failed quickly ] +# These value is used in getting data from another process +QUEUE_GET_TIMEOUT = 60 + +USE_PINNED_MEMORY = None +# AutoTune Flags +USE_AUTOTUNE = False +TUNING_STEPS = 500 + + +def set_autotune_config(use_autotune, tuning_steps=500): + global USE_AUTOTUNE + USE_AUTOTUNE = use_autotune + global TUNING_STEPS + TUNING_STEPS = tuning_steps + + +def use_pinned_memory(*args): + global USE_PINNED_MEMORY + if len(args) == 0: + return USE_PINNED_MEMORY + else: + assert len(args) == 1 and isinstance(args[0], bool) + USE_PINNED_MEMORY = args[0] + + +def _convert_places(places): + if not isinstance(places, (list, tuple)): + places = [places] + + ret = [] + for p in places: + if not isinstance(p, core.Place): + tmp = core.Place() + tmp.set_place(p) + p = tmp + + ret.append(p) + return ret + + +class AuToTune: + def __init__(self, loader): + self.loader = loader + self.max_num_worker = multiprocessing.cpu_count() / 2 + + def __call__(self): + # use default loader + if (not USE_AUTOTUNE) or (not self.need_autotune()): + return self.loader.num_workers + + # get autotune loader + auto_tune_loader = self.get_autotune_loader() + if auto_tune_loader is None: + return self.loader.num_workers + + # pick the best num_workers + auto_tune_start = time.time() + logging.debug("========= DataLoader Auto Tune =========") + logging.debug( + "User config for DataLoader: " + str(self.loader.num_workers) + ) + best_num_workers = 0 + min_cost = float("inf") + logging.debug( + "Tuning Range for num_workers: 0 ~ " + str(self.max_num_worker) + ) + num_workers = 0 + while num_workers < self.max_num_worker: + auto_tune_loader.num_workers = num_workers + avg_cost = self.evaluate_reader_cost(auto_tune_loader) + if min_cost * 0.75 > avg_cost: + min_cost = avg_cost + best_num_workers = num_workers + else: + update_num = self.is_best( + auto_tune_loader, + best_num_workers, + min_cost, + self.max_num_worker, + ) + if update_num == best_num_workers: + break + else: + best_num_workers = update_num + logging.debug( + "num_workers: " + + str(num_workers) + + " avg_cost: " + + str(avg_cost) + ) + num_workers += 2 + logging.info( + "auto_tune dataLoader best_num_workers: " + str(best_num_workers) + ) + logging.debug( + "AutoTuning Cost for DataLoader: " + + str(time.time() - auto_tune_start) + + ' seconds' + ) + + # tune the default loader's num_workers + return best_num_workers + + def need_autotune(self): + if sys.platform == 'darwin' or sys.platform == 'win32': + return False + else: + return True + + def get_sub_dataset(self, dataset, batch_size): + num_samples = min(batch_size * TUNING_STEPS, len(dataset)) + sub_dataset = Subset(dataset, indices=list(range(num_samples))) + return sub_dataset + + def get_autotune_loader(self): + loader = copy.copy(self.loader) + batch_size = self.loader.batch_sampler.batch_size + if isinstance( + self.loader.batch_sampler, 
paddle.io.DistributedBatchSampler
+        ):
+            dataset = self.loader.batch_sampler.dataset
+            sub_dataset = self.get_sub_dataset(dataset, batch_size)
+            loader.batch_sampler = paddle.io.DistributedBatchSampler(
+                dataset=sub_dataset,
+                batch_size=batch_size,
+                num_replicas=self.loader.batch_sampler.nranks,
+                rank=self.loader.batch_sampler.local_rank,
+                shuffle=self.loader.batch_sampler.shuffle,
+                drop_last=self.loader.batch_sampler.drop_last,
+            )
+        elif isinstance(self.loader.batch_sampler, paddle.io.BatchSampler):
+            dataset = self.loader.batch_sampler.sampler.data_source
+            sub_dataset = self.get_sub_dataset(dataset, batch_size)
+            loader.batch_sampler = paddle.io.BatchSampler(
+                dataset=sub_dataset,
+                batch_size=batch_size,
+                drop_last=self.loader.batch_sampler.drop_last,
+            )
+        else:
+            loader = None
+        return loader
+
+    def evaluate_reader_cost(self, reader):
+        costs = []
+        avg_cost = 0
+        start = time.time()
+        for i, data in enumerate(reader):
+            costs.append(time.time() - start)
+            start = time.time()
+        if len(costs) > 2:
+            avg_cost = sum(costs[2:]) / len(costs[2:])
+        else:
+            avg_cost = sum(costs[0:]) / len(costs[0:])
+        return avg_cost
+
+    def is_best(self, reader, best_workers, best_time, num_work_boundary):
+        step = 0
+        num_workers = best_workers + 1
+        boundary = 1
+        while num_workers < num_work_boundary and step < 5:
+            self.loader.num_workers = num_workers
+            time = self.evaluate_reader_cost(reader)
+            logging.debug(
+                "for back num_workers: "
+                + str(num_workers)
+                + " avg_cost: "
+                + str(time)
+            )
+            step += 1
+            if time < best_time * 0.70 * boundary:
+                return num_workers
+            else:
+                num_workers += 1
+                boundary *= 0.80
+        return best_workers
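AuToTune above is gated by the module-level USE_AUTOTUNE/TUNING_STEPS flags, which the paddle.incubate.autotune.set_config hunk earlier in this diff now toggles through paddle.io.reader.set_autotune_config. A sketch of the user-facing switch (the "tuning_steps" key appears in that handler; the "enable" key is inferred from it and may differ):

    import paddle

    # Sketch: turn on DataLoader num_workers autotuning for the next runs.
    # 500 steps is just the default shown in set_autotune_config above.
    paddle.incubate.autotune.set_config(
        {"dataloader": {"enable": True, "tuning_steps": 500}}
    )
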
+
+
+class DataLoader:
+    """
+    DataLoader provides an iterator which iterates given dataset
+    once by the batch_sampler.
+
+    DataLoader supports single-process and multi-process data loading,
+    multi-process workers will be used to load data asynchronously if
+    :attr:`num_workers` is set as a positive number.
+
+    DataLoader supports map-style dataset and iterable-style dataset.
+
+    For map-style dataset(can get a sample from dataset with a given
+    index), please see :code:`paddle.io.Dataset`.
+
+    For iterable-style dataset(get samples from dataset iteratively,
+    like a Python iterator), please see :code:`paddle.io.IterableDataset`.
+
+    For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler`
+
+    .. note::
+        GPU tensor operation is not supported in subprocess currently,
+        please don't use GPU tensor operations in pipeline which will
+        be performed in subprocess, such as dataset transforms, collate_fn,
+        etc. Numpy array and CPU tensor operation is supported.
+
+    **Disable automatic batching**
+
+    In certain cases such as some NLP tasks, instead of automatic batching,
+    handling batching manually in dataset is needed by users. For these
+    cases, automatic batching is disabled if both :attr:`batch_size` and
+    :attr:`batch_sampler` is set as None, each data got from :attr:`dataset`
+    should be batched data and will be processed with function defined by
+    :attr:`collate_fn` or :attr:`default_collate_fn`.
+
+
+    .. note::
+        When automatic batching is disabled, :attr:`default_collate_fn` will
+        do nothing to data from dataset.
+
+
+    Args:
+        dataset(Dataset): the dataset to load data from, should be an
+            instance of subclass of :code:`paddle.io.Dataset` or
+            :code:`paddle.io.IterableDataset`.
+        feed_list (list(Tensor)|tuple(Tensor), optional): feed Tensor list.
+            The Tensors should be created by :code:`paddle.static.data()`.
+            :attr:`feed_list` must be set if :attr:`return_list` is
+            False. Default None.
+        places(list(Place)|tuple(Place)|list(str), optional): a list of Place,
+            to put data onto, :attr:`places` can be None, if
+            :attr:`places` is None, default place(CPUPlace or CUDAPlace(0))
+            will be used. Default None. If ``places`` is a list of strings,
+            the string in the list can be ``cpu``, ``gpu:x`` and ``gpu_pinned``,
+            where ``x`` is the index of the GPUs.
+        return_list (bool, optional): whether the return value on each device is
+            presented as a list. If :attr:`return_list=False`, the return
+            value on each device would be a dict of str -> Tensor, where
+            the key of the dict is the name of each fed Tensors. If
+            :attr:`return_list=True`, the return value on each device would
+            be a list(Tensor). :attr:`return_list` can only be True
+            in dynamic graph mode. Default True.
+        batch_sampler(BatchSampler, optional): an instance of `paddle.io.BatchSampler`
+            to generate batch indices to draw samples from :attr:`dataset`
+            and combine a batch. Default None.
+        batch_size(int|None, optional): sample number in a mini-batch, a substitution
+            parameter for :attr:`batch_sampler`, if :attr:`batch_sampler`
+            is not set, a default `paddle.io.BatchSampler` will be used
+            and initialized by :attr:`batch_size`, :attr:`shuffle` and
+            :attr:`drop_last`. Default 1.
+        shuffle(bool, optional): whether to shuffle indices order before generating
+            batch indices, a substitution parameter for :attr:`batch_sampler`,
+            see :attr:`batch_size`. Default False.
+        drop_last(bool, optional): whether to drop the last incomplete batch when dataset
+            size is not divisible by the batch size, a substitution parameter
+            for :attr:`batch_sampler`, see :attr:`batch_size`. Default False.
+        collate_fn(callable, optional): function to generate mini-batch data by merging
+            the sample list, None for only stacking each field of samples in axis
+            0(same as :attr:`np.stack(..., axis=0)`). Default None.
+        num_workers(int, optional): the number of subprocesses used to load data, 0 for no
+            subprocess used and loading data in main process. Default 0.
+        use_buffer_reader (bool, optional): whether to use buffered reader.
+            If use_buffer_reader=True, the DataLoader would prefetch
+            batch data asynchronously, so it would speed up data feeding
+            and occupies a little more CPU or GPU memory, i.e., the memory
+            of one batch input data. Default True.
+        prefetch_factor (int, optional): Number of batch data the DataLoader would prefetch
+            if use_buffer_reader=True. Default 2.
+        use_shared_memory (bool, optional): whether to use shared memory to speed up
+            putting data into inter-process queue, set :attr:`use_shared_memory`
+            as True only when the shared memory space on your machine(e.g.
+            space of '/dev/shm' on Linux operating system) is large enough.
+            Shared memory will only be enabled in multi-process mode(num_workers
+            > 0). Default True.
+        timeout(int, optional): the timeout value for getting data from output queue
+            of subprocesses. Default 0.
+        worker_init_fn(callable, optional): init function which will be called with
+            worker id on each subprocess starting if not set as None. Default
+            None.
+
+    Returns:
+        DataLoader: an iterable object for data iterating, each element of the generated data is a Tensor.
+
+    Examples:
+
+        ..
code-block:: python + + import numpy as np + + import paddle + import paddle.nn as nn + import paddle.nn.functional as F + from paddle.io import Dataset, BatchSampler, DataLoader + + BATCH_NUM = 20 + BATCH_SIZE = 16 + EPOCH_NUM = 4 + + IMAGE_SIZE = 784 + CLASS_NUM = 10 + + # define a random dataset + class RandomDataset(Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + + class SimpleNet(nn.Layer): + def __init__(self): + super().__init__() + self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) + + def forward(self, image, label=None): + return self.fc(image) + + simple_net = SimpleNet() + opt = paddle.optimizer.SGD(learning_rate=1e-3, + parameters=simple_net.parameters()) + + loader = DataLoader(dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + for e in range(EPOCH_NUM): + for i, (image, label) in enumerate(loader()): + out = simple_net(image) + loss = F.cross_entropy(out, label) + avg_loss = paddle.mean(loss) + avg_loss.backward() + opt.minimize(avg_loss) + simple_net.clear_gradients() + print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) + + + .. note:: + For reading iterable dataset with multiprocess Dataloader, + please see :code:`paddle.io.IterableDataset` + + """ + + def __init__( + self, + dataset, + feed_list=None, + places=None, + return_list=True, + batch_sampler=None, + batch_size=1, + shuffle=False, + drop_last=False, + collate_fn=None, + num_workers=0, + use_buffer_reader=True, + prefetch_factor=2, + use_shared_memory=True, + timeout=0, + worker_init_fn=None, + persistent_workers=False, + ): + self.return_list = return_list + self.collate_fn = collate_fn + self.use_buffer_reader = use_buffer_reader + self.prefetch_factor = prefetch_factor + self.worker_init_fn = worker_init_fn + + self.dataset = dataset + + if not return_list and not _non_static_mode(): + assert ( + feed_list is not None + ), "feed_list should be set when return_list=False" + self.feed_list = feed_list + + if places is None: + places = _current_expected_place() + if isinstance(places, (list, tuple)): + places = _get_paddle_place_list(places) + else: + places = _get_paddle_place(places) + self.places = _convert_places(places) + + assert num_workers >= 0, "num_workers should be a non-negative value" + if num_workers > 0 and ( + sys.platform == 'darwin' or sys.platform == 'win32' + ): + warnings.warn( + "DataLoader with multi-process mode is not supported on MacOs and Windows currently." 
+ " Please use signle-process mode with num_workers = 0 instead" + ) + num_workers = 0 + self.num_workers = num_workers + + assert prefetch_factor > 0, "prefetch_factor should be a positive value" + + self.use_shared_memory = use_shared_memory + if use_shared_memory and num_workers == 0: + self.use_shared_memory = False + + assert timeout >= 0, "timeout should be a non-negative value" + self.timeout = timeout + + if isinstance(dataset, IterableDataset): + self.dataset_kind = _DatasetKind.ITER + if shuffle: + raise ValueError( + "IterableDataset not support shuffle, but got shuffle={}".format( + shuffle + ) + ) + if batch_sampler is not None: + raise ValueError( + "IterableDataset expect unspecified batch_sampler" + ) + else: + self.dataset_kind = _DatasetKind.MAP + + if batch_sampler is not None: + assert batch_size == 1 and not shuffle and not drop_last, ( + "batch_size/shuffle/drop_last should not be set when " + "batch_sampler is given" + ) + self.batch_sampler = batch_sampler + self.batch_size = None + elif batch_size is None: + self.batch_sampler = None + self.batch_size = None + else: + assert batch_size > 0, ( + "batch_size should be None or a positive value when " + "batch_sampler is not given" + ) + self.batch_size = batch_size + if isinstance(dataset, IterableDataset): + self.batch_sampler = _InfiniteIterableSampler( + dataset, batch_size + ) + else: + self.batch_sampler = BatchSampler( + dataset=dataset, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + ) + + self.drop_last = drop_last + self.auto_collate_batch = self.batch_sampler is not None + + self.pin_memory = False + if _non_static_mode(): + self.pin_memory = ( + True if use_pinned_memory() is None else use_pinned_memory() + ) + + self._persistent_workers = persistent_workers + self._iterator = None + self.num_workers = AuToTune(self).__call__() + + def __len__(self): + if self.dataset_kind == _DatasetKind.ITER: + raise ValueError("length of IterableDataset not supported") + else: + if self.auto_collate_batch: + return len(self.batch_sampler) + else: + return len(self.dataset) + + def __iter__(self): + if self.num_workers == 0: + return _DataLoaderIterSingleProcess(self) + elif self._persistent_workers: + if self._iterator is None: + self._iterator = _DataLoaderIterMultiProcess(self) + else: + self._iterator._reset() + return self._iterator + else: + return _DataLoaderIterMultiProcess(self) + + def __call__(self): + return self.__iter__() diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index f8e00eabecf5e..c7e550f7aa117 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -181,9 +181,7 @@ def __init__( not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu() ): - raise NotImplementedError( - "'lr_ratio' is unimplemented in CPU, and NPU" - ) + raise NotImplementedError("'lr_ratio' is unimplemented in CPU.") if parameters is not None: # paddle.Tensor is also iterable, so here we don't check whether diff --git a/python/paddle/static/quantization/post_training_quantization.py b/python/paddle/static/quantization/post_training_quantization.py index f80b293a12d4b..266c1756a334b 100644 --- a/python/paddle/static/quantization/post_training_quantization.py +++ b/python/paddle/static/quantization/post_training_quantization.py @@ -620,7 +620,7 @@ def _load_model_data(self): self._batch_nums if self._batch_nums else len(self._data_loader) ) return - self._data_loader = io.DataLoader.from_generator( + self._data_loader = 
reader.DataLoader.from_generator( feed_list=feed_vars, capacity=3 * self._batch_size, iterable=True ) if self._sample_generator is not None: diff --git a/python/setup.py.in b/python/setup.py.in index 1c59c4aaa4746..ebf949c2b41f3 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -445,7 +445,6 @@ packages=['paddle', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', - 'paddle.fluid.dataloader', 'paddle.fluid.contrib', 'paddle.fluid.contrib.extend_optimizer', 'paddle.fluid.incubate', @@ -492,6 +491,7 @@ packages=['paddle', 'paddle.sparse.nn.functional', 'paddle.incubate.xpu', 'paddle.io', + 'paddle.io.dataloader', 'paddle.optimizer', 'paddle.nn', 'paddle.nn.functional', diff --git a/setup.py b/setup.py index 6a305243bbe3d..297e66eba0499 100644 --- a/setup.py +++ b/setup.py @@ -1421,7 +1421,6 @@ def get_setup_parameters(): 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', - 'paddle.fluid.dataloader', 'paddle.fluid.contrib', 'paddle.fluid.contrib.extend_optimizer', 'paddle.fluid.incubate', @@ -1468,6 +1467,7 @@ def get_setup_parameters(): 'paddle.sparse.nn.functional', 'paddle.incubate.xpu', 'paddle.io', + 'paddle.io.dataloader', 'paddle.optimizer', 'paddle.nn', 'paddle.nn.functional', diff --git a/test/auto_parallel/auto_parallel_relaunch_model.py b/test/auto_parallel/auto_parallel_relaunch_model.py index 290af66485512..6fa3bc9eaa1ff 100644 --- a/test/auto_parallel/auto_parallel_relaunch_model.py +++ b/test/auto_parallel/auto_parallel_relaunch_model.py @@ -109,7 +109,7 @@ def mlp_pretrain_forward(train_program, start_program): error_cost = paddle.nn.functional.square_error_cost(predict, label) loss = paddle.mean(error_cost) - loader = paddle.io.DataLoader.from_generator( + loader = paddle.fluid.io.DataLoader.from_generator( feed_list=[input, label], capacity=4 * batch_size, iterable=True ) diff --git a/test/auto_parallel/engine_api.py b/test/auto_parallel/engine_api.py index cb0b4f2541e94..a2725a57b8e53 100644 --- a/test/auto_parallel/engine_api.py +++ b/test/auto_parallel/engine_api.py @@ -297,7 +297,7 @@ def train_builtin_data_vars(): with static.program_guard(engine.main_program, engine.startup_program): feed_list = engine.inputs + engine.labels print(feed_list) - loader = paddle.io.DataLoader.from_generator( + loader = paddle.fluid.io.DataLoader.from_generator( feed_list=feed_list, capacity=4 * batch_size, iterable=False ) @@ -324,7 +324,7 @@ def train_non_builtin_data_vars(): ) label = static.data(name="label", shape=[batch_size, 1], dtype='int64') - loader = paddle.io.DataLoader.from_generator( + loader = paddle.fluid.io.DataLoader.from_generator( feed_list=[input, label], capacity=4 * batch_size, iterable=False ) places = static.cuda_places() @@ -383,7 +383,7 @@ def get_cost(): ) label = static.data(name="label", shape=[batch_size, 1], dtype='int64') - loader = paddle.io.DataLoader.from_generator( + loader = paddle.fluid.io.DataLoader.from_generator( feed_list=[input, label], capacity=4 * batch_size, iterable=False ) places = static.cuda_places() @@ -434,7 +434,7 @@ def get_cost_by_default_program(): ) label = static.data(name="label", shape=[batch_size, 1], dtype='int64') - loader = paddle.io.DataLoader.from_generator( + loader = paddle.fluid.io.DataLoader.from_generator( feed_list=[input, label], capacity=4 * batch_size, iterable=False ) places = static.cuda_places() diff --git a/test/auto_parallel/test_dist_attr_v2.py b/test/auto_parallel/test_dist_attr_v2.py index 11c140a812a9f..1d15c34221f90 100644 --- 
a/test/auto_parallel/test_dist_attr_v2.py +++ b/test/auto_parallel/test_dist_attr_v2.py @@ -130,7 +130,7 @@ def get_program(): ) data_holder = [input, label] # dataloader - dataloader = paddle.io.DataLoader.from_generator( + dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False ) dataloader.set_batch_generator( diff --git a/test/auto_parallel/test_dist_context.py b/test/auto_parallel/test_dist_context.py index 10f78aedd4fb9..2944b2db2a3fb 100644 --- a/test/auto_parallel/test_dist_context.py +++ b/test/auto_parallel/test_dist_context.py @@ -112,7 +112,7 @@ def get_program(): ) data_holder = [input, label] # dataloader - dataloader = paddle.io.DataLoader.from_generator( + dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False ) dataloader.set_batch_generator( diff --git a/test/auto_parallel/test_serialization.py b/test/auto_parallel/test_serialization.py index 00a30e8a61d4e..d89c9596f4cdb 100644 --- a/test/auto_parallel/test_serialization.py +++ b/test/auto_parallel/test_serialization.py @@ -124,7 +124,7 @@ def get_program(): ) data_holder = [input, label] # dataloader - dataloader = paddle.io.DataLoader.from_generator( + dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False ) dataloader.set_batch_generator( diff --git a/test/auto_parallel/test_while_op_completion.py b/test/auto_parallel/test_while_op_completion.py index 6d5264ab971b7..3f9b5b151ab08 100644 --- a/test/auto_parallel/test_while_op_completion.py +++ b/test/auto_parallel/test_while_op_completion.py @@ -148,7 +148,7 @@ def get_program(): ) data_holder = [input, label] # dataloader - dataloader = paddle.io.DataLoader.from_generator( + dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False ) dataloader.set_batch_generator( diff --git a/test/auto_parallel/test_while_op_partition.py b/test/auto_parallel/test_while_op_partition.py index 6dc02d6834f5b..00f3a70bbcf42 100644 --- a/test/auto_parallel/test_while_op_partition.py +++ b/test/auto_parallel/test_while_op_partition.py @@ -136,7 +136,7 @@ def get_program(): data_holder = [input, label] # dataloader - dataloader = paddle.io.DataLoader.from_generator( + dataloader = fluid.io.DataLoader.from_generator( feed_list=data_holder, capacity=4 * batch_size, iterable=False ) dataloader.set_batch_generator( diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt index 34f45371cca2a..7bc33dfda33e8 100644 --- a/test/cpp/fluid/CMakeLists.txt +++ b/test/cpp/fluid/CMakeLists.txt @@ -2,9 +2,11 @@ add_subdirectory(benchmark) if(WITH_CINN) add_subdirectory(cinn) endif() -# add_subdirectory(controlflow) -# add_subdirectory(detection) -# add_subdirectory(dlnne) +add_subdirectory(controlflow) +add_subdirectory(detection) +if(WITH_DLNNE) + add_subdirectory(dlnne) +endif() add_subdirectory(elementwise) add_subdirectory(fused) if(WITH_LITE) @@ -21,7 +23,10 @@ endif() add_subdirectory(prim_ops) add_subdirectory(reader) add_subdirectory(reduce_ops) -# add_subdirectory(tensorrt) +# TODO(gouzil): enable this after the bug is fixed. 
windows: Exit code 0xc000007b, pr: #53470 +# if(WITH_GPU AND TENSORRT_FOUND) +# add_subdirectory(tensorrt) +# endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} executor) diff --git a/test/cpp/fluid/controlflow/CMakeLists.txt b/test/cpp/fluid/controlflow/CMakeLists.txt new file mode 100644 index 0000000000000..87950fdcd46f2 --- /dev/null +++ b/test/cpp/fluid/controlflow/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_test( + conditional_block_op_test + SRCS conditional_block_op_test.cc + DEPS conditional_block_op standalone_executor executor) diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_test.cc b/test/cpp/fluid/controlflow/conditional_block_op_test.cc similarity index 100% rename from paddle/fluid/operators/controlflow/conditional_block_op_test.cc rename to test/cpp/fluid/controlflow/conditional_block_op_test.cc diff --git a/test/cpp/fluid/detection/CMakeLists.txt b/test/cpp/fluid/detection/CMakeLists.txt new file mode 100644 index 0000000000000..bc9e8e0a53603 --- /dev/null +++ b/test/cpp/fluid/detection/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_test( + mask_util_test + SRCS mask_util_test.cc + DEPS memory mask_util) diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/test/cpp/fluid/detection/mask_util_test.cc similarity index 100% rename from paddle/fluid/operators/detection/mask_util_test.cc rename to test/cpp/fluid/detection/mask_util_test.cc diff --git a/test/cpp/fluid/dlnne/CMakeLists.txt b/test/cpp/fluid/dlnne/CMakeLists.txt new file mode 100644 index 0000000000000..5089a41351ce3 --- /dev/null +++ b/test/cpp/fluid/dlnne/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_test( + test_dlnne_engine_op + SRCS dlnne_engine_op_test.cc + DEPS dlnne_engine_op analysis) diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/test/cpp/fluid/dlnne/dlnne_engine_op_test.cc similarity index 100% rename from paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc rename to test/cpp/fluid/dlnne/dlnne_engine_op_test.cc diff --git a/test/dygraph_to_static/test_resnet_v2.py b/test/dygraph_to_static/test_resnet_v2.py index bf332809ff8f0..2efbe46cedfec 100644 --- a/test/dygraph_to_static/test_resnet_v2.py +++ b/test/dygraph_to_static/test_resnet_v2.py @@ -255,7 +255,7 @@ def do_train(self, to_static): batch_size=batch_size, drop_last=True, ) - data_loader = paddle.io.DataLoader.from_generator( + data_loader = paddle.fluid.io.DataLoader.from_generator( capacity=5, iterable=True ) data_loader.set_sample_list_generator(train_reader) diff --git a/test/dygraph_to_static/test_simnet_v2.py b/test/dygraph_to_static/test_simnet_v2.py index 3e8cb4c10b3d4..a86259cc6d736 100644 --- a/test/dygraph_to_static/test_simnet_v2.py +++ b/test/dygraph_to_static/test_simnet_v2.py @@ -132,7 +132,7 @@ def train(conf_dict, to_static): global_step = 0 losses = [] - train_loader = paddle.io.DataLoader.from_generator( + train_loader = paddle.fluid.io.DataLoader.from_generator( capacity=16, return_list=True, iterable=True, use_double_buffer=True ) get_train_examples = simnet_process.get_reader("train", epoch=args.epoch) diff --git a/test/ir/inference/test_trt_convert_elementwise.py b/test/ir/inference/test_trt_convert_elementwise.py index a6faff0787be5..0ac4a2ba46209 100644 --- a/test/ir/inference/test_trt_convert_elementwise.py +++ b/test/ir/inference/test_trt_convert_elementwise.py @@ -1214,5 +1214,161 @@ def test(self): self.run_test() +class TrtConvertElementwise0D(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def 
generate_input(dims, op_type): + shape = [] + if dims == 0: + shape = [] + elif dims == 1: + shape = [8] + elif dims == 2: + shape = [1, 8] + elif dims == 3: + shape = [1, 8, 8] + else: + shape = [1, 8, 8, 8] + + # elementwise_floordiv is integer only + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int32 + ) + elif op_type == "elementwise_mod": + return np.random.uniform(low=0.1, high=1.0, size=shape).astype( + np.float32 + ) + else: + return np.random.random(shape).astype(np.float32) + + for dims in [[0, 0], [0, 1], [0, 2], [1, 0], [2, 0]]: + for op_type in [ + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + "elementwise_max", + "elementwise_floordiv", + "elementwise_mod", + ]: + for axis in [-1 if dims[0] == 1 or dims[0] == 0 else 1]: + self.dims = dims[0] + dics = [{"axis": axis}] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data"], + "Y": ["weight"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + "outputs_dtype": { + "output_data": np.float32 + if op_type != "elementwise_floordiv" + else np.int32 + }, + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "weight": TensorConfig( + data_gen=partial( + generate_input, dims[1], op_type + ) + ) + }, + inputs={ + "input_data": TensorConfig( + data_gen=partial( + generate_input, dims[0], op_type + ) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + # The input.dims[1] must be equal to the weight's length. 
+ if self.dims == 0: + self.dynamic_shape.min_input_shape = {"input_data": []} + self.dynamic_shape.max_input_shape = {"input_data": []} + self.dynamic_shape.opt_input_shape = {"input_data": []} + elif self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [16]} + self.dynamic_shape.opt_input_shape = {"input_data": [8]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 8]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 8]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 1, 4]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 16, 16]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 8, 8]} + elif self.dims == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 8, 8, 8] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 8, 8, 8] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [4, 8, 8, 8] + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if not dynamic_shape and (self.dims == 1 or self.dims == 0): + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), (1e-3, 1e-3) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), (1e-3, 1e-3) + + def test(self): + self.run_test() + + if __name__ == "__main__": unittest.main() diff --git a/test/ir/inference/test_trt_convert_equal.py b/test/ir/inference/test_trt_convert_equal.py index 4993e830f190b..5879a003d9546 100644 --- a/test/ir/inference/test_trt_convert_equal.py +++ b/test/ir/inference/test_trt_convert_equal.py @@ -40,54 +40,64 @@ def generate_input(shape): return np.random.random(shape).astype(np.float32) for op_type in ["equal", "not_equal"]: - for batch in [1, 2, 4]: - for shape in [[batch, 1], [batch, 1, 32], [batch, 1, 16, 32]]: - for axis in [-1 if len(shape) == 1 else 1]: - self.dims = len(shape) - dics = [{"axis": axis}, {"in_dtype": 0, "out_dtype": 5}] - ops_config = [ - { - "op_type": op_type, - "op_inputs": { - "X": ["input_data1"], - "Y": ["input_data2"], - }, - "op_outputs": {"Out": ["compare_output_data"]}, - "op_attrs": dics[0], - "outputs_dtype": { - "compare_output_data": np.bool_ - }, + for shape in [[], [1, 1], [1, 1, 32], [1, 1, 16, 32]]: + for axis in [-1 if len(shape) == 1 or len(shape) == 0 else 1]: + self.dims = len(shape) + dics = [{"axis": axis}, {"in_dtype": 0, "out_dtype": 5}] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data1"], + "Y": ["input_data2"], }, - { - "op_type": "cast", - "op_inputs": {"X": ["compare_output_data"]}, - "op_outputs": {"Out":
["output_data"]}, - "op_attrs": dics[1], - "outputs_dtype": {"output_data": np.float32}, - }, - ] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data1": TensorConfig( - data_gen=partial(generate_input, shape) - ), - "input_data2": TensorConfig( - data_gen=partial(generate_input, shape) - ), - }, - outputs=["output_data"], - ) - yield program_config + "op_outputs": {"Out": ["compare_output_data"]}, + "op_attrs": dics[0], + "outputs_dtype": {"compare_output_data": np.bool_}, + }, + { + "op_type": "cast", + "op_inputs": {"X": ["compare_output_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[1], + "outputs_dtype": {"output_data": np.float32}, + }, + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input, shape) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input, shape) + ), + }, + outputs=["output_data"], + ) + yield program_config def sample_predictor_configs( self, program_config ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): # The input.dims[1] must be equal to the weight's length. + if self.dims == 0: + self.dynamic_shape.min_input_shape = { + "input_data1": [], + "input_data2": [], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [], + "input_data2": [], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [], + "input_data2": [], + } if self.dims == 2: self.dynamic_shape.min_input_shape = { "input_data1": [1, 1], diff --git a/test/ir/inference/test_trt_convert_expand_as_v2.py b/test/ir/inference/test_trt_convert_expand_as_v2.py index 46b3a2232e471..be5458cac07dc 100644 --- a/test/ir/inference/test_trt_convert_expand_as_v2.py +++ b/test/ir/inference/test_trt_convert_expand_as_v2.py @@ -49,8 +49,11 @@ def generate_input1(attrs: List[Dict[str, Any]]): elif self.dims == 1: self.input_shape = [32] return np.random.random([32]).astype(np.float32) + elif self.dims == 0: + self.input_shape = [] + return np.random.random([]).astype(np.float32) - for dims in [1, 2, 3, 4]: + for dims in [0, 1, 2, 3, 4]: for shape in [ [10, 8, 32, 32], [2, 8, 32, 32], @@ -125,6 +128,10 @@ def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"expand_v2_input": [32]} self.dynamic_shape.max_input_shape = {"expand_v2_input": [64]} self.dynamic_shape.opt_input_shape = {"expand_v2_input": [32]} + elif self.dims == 0: + self.dynamic_shape.min_input_shape = {"expand_v2_input": []} + self.dynamic_shape.max_input_shape = {"expand_v2_input": []} + self.dynamic_shape.opt_input_shape = {"expand_v2_input": []} def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} @@ -132,7 +139,9 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if dynamic_shape: + ver = paddle_infer.get_trt_compile_version() + ver_num = ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 + if dynamic_shape and (ver_num > 8000 or self.dims > 0): return 1, 2 else: return 0, 3 diff --git a/test/ir/inference/test_trt_convert_reshape.py b/test/ir/inference/test_trt_convert_reshape.py index 3f88b39003bb9..c30d973651bad 100644 --- a/test/ir/inference/test_trt_convert_reshape.py +++ b/test/ir/inference/test_trt_convert_reshape.py @@ -431,5 +431,99 @@ def test(self): self.run_test() +class TrtConvertReshapeZeroDimsTest(TrtLayerAutoScanTest): + def is_program_valid(self, 
program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(attrs: List[Dict[str, Any]]): + if self.dims > 0: + self.input_shape = [1] * self.dims + return np.random.random(self.input_shape).astype(np.float32) + elif self.dims == 0: + self.input_shape = [] + return np.random.random([]).astype(np.float32) + + for dims in [0, 1, 2, 3]: + for shape in [ + [], + [1, 1], + ]: + dics = [ + { + "shape": shape, + }, + ] + self.dims = dims + dics_input = [{"X": ["reshape_input"]}] + + ops_config = [ + { + "op_type": "reshape", + "op_inputs": dics_input[0], + "op_outputs": {"Out": ["reshape_out"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "reshape_input": TensorConfig( + data_gen=partial(generate_input1, dics) + ) + }, + outputs=["reshape_out"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "reshape_input": self.input_shape + } + self.dynamic_shape.max_input_shape = { + "reshape_input": self.input_shape + } + self.dynamic_shape.opt_input_shape = { + "reshape_input": self.input_shape + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + # only test dynamic shape mode + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-3 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_model.py b/test/legacy_test/test_model.py index af3cb7bdefb74..c5d62761f7f68 100644 --- a/test/legacy_test/test_model.py +++ b/test/legacy_test/test_model.py @@ -199,13 +199,13 @@ def setUpClass(cls): mode='test', return_label=False, sample_num=sp_num ) - cls.train_loader = fluid.io.DataLoader( + cls.train_loader = paddle.io.DataLoader( cls.train_dataset, places=cls.device, batch_size=64 ) - cls.val_loader = fluid.io.DataLoader( + cls.val_loader = paddle.io.DataLoader( cls.val_dataset, places=cls.device, batch_size=64 ) - cls.test_loader = fluid.io.DataLoader( + cls.test_loader = paddle.io.DataLoader( cls.test_dataset, places=cls.device, batch_size=64 ) @@ -322,14 +322,14 @@ def fit(self, dynamic, num_replicas=None, rank=None, num_iters=None): rank=rank, ) - train_loader = fluid.io.DataLoader( + train_loader = paddle.io.DataLoader( self.train_dataset, batch_sampler=train_sampler, places=self.device, return_list=True, ) - val_loader = fluid.io.DataLoader( + val_loader = paddle.io.DataLoader( self.val_dataset, batch_sampler=val_sampler, places=self.device, @@ -375,14 +375,14 @@ def fit_with_tuple_input(self, dynamic, num_replicas=None, rank=None): rank=rank, ) - train_loader = fluid.io.DataLoader( + train_loader = paddle.io.DataLoader( self.train_dataset, batch_sampler=train_sampler, places=self.device, return_list=True, )
- val_loader = fluid.io.DataLoader( + val_loader = paddle.io.DataLoader( self.val_dataset, batch_sampler=val_sampler, places=self.device, @@ -404,7 +404,7 @@ def evaluate(self, dynamic): self.val_dataset, batch_size=64, shuffle=False ) - val_loader = fluid.io.DataLoader( + val_loader = paddle.io.DataLoader( self.val_dataset, batch_sampler=sampler, places=self.device, @@ -432,7 +432,7 @@ def predict(self, dynamic): self.test_dataset, batch_size=64, shuffle=False ) - test_loader = fluid.io.DataLoader( + test_loader = paddle.io.DataLoader( self.test_dataset, batch_sampler=sampler, places=self.device, diff --git a/test/prim/prim/vjp/test_comp_high_grad.py b/test/prim/prim/vjp/test_comp_high_grad.py index 76283528e2404..87b3c8f300ecd 100644 --- a/test/prim/prim/vjp/test_comp_high_grad.py +++ b/test/prim/prim/vjp/test_comp_high_grad.py @@ -226,6 +226,7 @@ def test_high_grad(self): self.func_triple(p) +''' @param.parameterized_class( ('shape1', 'shape2'), [ @@ -328,7 +329,6 @@ def test_high_grad(self): for p in places: self.func_double(p) self.func_triple(p) - - +''' if __name__ == '__main__': unittest.main() diff --git a/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py new file mode 100644 index 0000000000000..e4b290a2e66e1 --- /dev/null +++ b/test/xpu/test_depthwise_conv2d_transpose_op_xpu.py @@ -0,0 +1,298 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from get_test_cover_info import ( + XPUOpTestWrapper, + create_test_class, + get_xpu_op_support_types, +) +from op_test_xpu import XPUOpTest + +import paddle + +paddle.enable_static() + + +def depthwiseconv2dtranspose_forward_naive(input_, filter_, attrs): + padding_algorithm = attrs['padding_algorithm'] + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError( + "Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME', 'VALID' or 'EXPLICIT'."
% str(padding_algorithm) + ) + + if attrs['data_format'] == 'NHWC': + input_ = np.transpose(input_, [0, 3, 1, 2]) + in_n, in_c, in_h, in_w = input_.shape + f_c, f_out_c, f_h, f_w = filter_.shape + groups = attrs['groups'] + assert in_c == f_c + out_c = f_out_c * groups + sub_in_c = in_c // groups + + stride, pad, dilations = ( + attrs['strides'], + attrs['paddings'], + attrs['dilations'], + ) + + # update pad and dilation + def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): + padding = [] + for input_size, filter_size, stride_size in zip( + input_shape, kernel_size, kernel_stride + ): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max( + ((out_size - 1) * stride_size + filter_size - input_size, 0) + ) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + ksize = filter_.shape[2:4] + if padding_algorithm == "VALID": + pad = [0, 0, 0, 0] + elif padding_algorithm == "SAME": + dilations = [1, 1] + input_data_shape = input_.shape[2:4] + pad = _get_padding_with_SAME(input_data_shape, ksize, stride) + + pad_h_0, pad_h_1 = pad[0], pad[0] + pad_w_0, pad_w_1 = pad[1], pad[1] + if len(pad) == 4: + pad_h_0, pad_h_1 = pad[0], pad[1] + pad_w_0, pad_w_1 = pad[2], pad[3] + + d_block_h = dilations[0] * (f_h - 1) + 1 + d_block_w = dilations[1] * (f_w - 1) + 1 + out_h = (in_h - 1) * stride[0] + d_block_h + out_w = (in_w - 1) * stride[1] + d_block_w + if 'output_size' in attrs: + output_size = attrs['output_size'] + out_h = output_size[0] + pad_h_0 + pad_h_1 + out_w = output_size[1] + pad_w_0 + pad_w_1 + out_pad_h = 0 + out_pad_w = 0 + if 'output_padding' in attrs: + out_pad_h = attrs['output_padding'][0] + out_pad_w = attrs['output_padding'][1] + out = np.zeros( + (in_n, out_c, out_h + out_pad_h, out_w + out_pad_w), dtype=input_.dtype + ) + + for n in range(in_n): + for i in range(in_h): + for j in range(in_w): + for g in range(groups): + input_masked = input_[ + n, g * sub_in_c : (g + 1) * sub_in_c, i, j + ] # (c) + input_masked = np.reshape(input_masked, (sub_in_c, 1, 1)) + input_masked = np.tile(input_masked, (1, f_h, f_w)) + + for k in range(f_out_c): + tmp_out = np.sum( + input_masked + * filter_[ + g * sub_in_c : (g + 1) * sub_in_c, k, :, : + ], + axis=0, + ) + i1, i2 = i * stride[0], i * stride[0] + d_block_h + j1, j2 = j * stride[1], j * stride[1] + d_block_w + out[ + n, + g * f_out_c + k, + i1 : i2 : dilations[0], + j1 : j2 : dilations[1], + ] += tmp_out + + out = out[ + :, + :, + pad_h_0 : out_h - pad_h_1 + out_pad_h, + pad_w_0 : out_w - pad_w_1 + out_pad_w, + ] + if attrs['data_format'] == 'NHWC': + out = np.transpose(out, [0, 2, 3, 1]) + return out + + +class XPUTestDepthwiseConv2DTransposeOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'depthwise_conv2d_transpose' + self.use_dynamic_create_class = False + + class TestDepthwiseConv2DTransposeOp(XPUOpTest): + def setUp(self): + # init as conv transpose + self.need_check_grad = True + self.is_test = False + self.use_cudnn = False + self.use_mkldnn = False + self.output_size = None + self.output_padding = [] + self.data_format = "NCHW" + self.pad = [0, 0] + self.padding_algorithm = "EXPLICIT" + self.init_op_type() + self.init_test_case() + self.__class__.op_type = "depthwise_conv2d_transpose" + + input_ = np.random.random(self.input_size).astype(self.dtype) + filter_ = np.random.random(self.filter_size).astype(self.dtype) + + self.inputs = {'Input': input_, 'Filter': filter_} + self.attrs = { + 'strides':
self.stride, + 'paddings': self.pad, + 'padding_algorithm': self.padding_algorithm, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'is_test': self.is_test, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + } + if self.output_size is not None: + self.attrs['output_size'] = self.output_size + + if len(self.output_padding) > 0: + self.attrs['output_padding'] = self.output_padding + + output = depthwiseconv2dtranspose_forward_naive( + input_, filter_, self.attrs + ).astype(self.dtype) + + self.outputs = {'Output': output} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_no_input(self): + if self.need_check_grad: + self.check_grad_with_place( + self.place, ['Filter'], 'Output', no_grad_set={'Input'} + ) + + def test_check_grad_no_filter(self): + if self.need_check_grad: + self.check_grad_with_place( + self.place, ['Input'], 'Output', no_grad_set={'Filter'} + ) + + def test_check_grad(self): + if self.need_check_grad: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, 'Output' + ) + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "depthwise_conv2d_transpose" + + class TestWithSymmetricPad(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + class TestWithAsymmetricPad(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.pad = [1, 0, 1, 2] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + class TestWithSAMEPad(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.stride = [2, 1] + self.dilations = [1, 2] + self.groups = 1 + self.input_size = [2, 3, 6, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 4, 3] + self.padding_algorithm = 'SAME' + + class TestWithVALIDPad(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + self.padding_algorithm = 'VALID' + + class TestWithGroups(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 2 + self.input_size = [2, 4, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3] + + class TestWithStride(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + class TestWithEvenUpsample(TestDepthwiseConv2DTransposeOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [2, 2] + self.groups = 1 + self.dilations = [1, 1] + self.output_size = [14, 14] + self.input_size = [2, 3, 7, 7] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 5, 5] + + +support_types = 
get_xpu_op_support_types('depthwise_conv2d_transpose') +for stype in support_types: + create_test_class(globals(), XPUTestDepthwiseConv2DTransposeOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/test/xpu/test_pad_op_xpu.py b/test/xpu/test_pad_op_xpu.py new file mode 100644 index 0000000000000..4f4d68ab73d0e --- /dev/null +++ b/test/xpu/test_pad_op_xpu.py @@ -0,0 +1,214 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +import numpy as np +from get_test_cover_info import ( + XPUOpTestWrapper, + create_test_class, + get_xpu_op_support_types, +) +from op_test_xpu import XPUOpTest +from test_attribute_var import UnittestBase + +import paddle +from paddle.fluid import Program, program_guard + + +def pad_wrapper(x, paddings, pad_value): + return paddle.nn.functional.pad( + x, pad=list(paddings), mode='constant', value=pad_value + ) + + +paddle.enable_static() + + +class XPUTestPadOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = "pad" + self.use_dynamic_create_class = False + + class TestPadOp(XPUOpTest): + def setUp(self): + self.op_type = "pad" + self.place = paddle.XPUPlace(0) + self.python_api = pad_wrapper + self.public_python_api = pad_wrapper + self.init_dtype() + self.init_test_case() + self.init_data() + + def init_dtype(self): + self.dtype = self.in_type + + def init_test_case(self): + self.shape = (16, 16) + self.paddings = [(0, 1), (2, 3)] + self.pad_value = 0.0 + + def init_data(self): + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} + self.outputs = { + 'Out': np.pad( + self.inputs['X'], + self.paddings, + mode='constant', + constant_values=self.pad_value, + ) + } + self.attrs = { + 'paddings': list(np.array(self.paddings).flatten()), + 'pad_value': self.pad_value, + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + class TestCase1(TestPadOp): + def init_test_case(self): + self.shape = (2, 3, 4, 5) + self.paddings = [(0, 1), (2, 3), (2, 1), (1, 1)] + self.pad_value = 0.5 + + class TestCase2(TestPadOp): + def init_test_case(self): + self.shape = (5, 5, 5) + self.paddings = [(0, 0), (0, 0), (1, 2)] + self.pad_value = 1.0 + + class TestCase3(TestPadOp): + def init_test_case(self): + self.shape = 100 + self.paddings = [(0, 1)] + self.pad_value = 0.9 + + class TestPadOpError(unittest.TestCase): + def test_errors(self): + with paddle.fluid.framework._static_guard(): + with program_guard(Program(), Program()): + input_data = np.random.random((2, 2)).astype("float32") + + def test_Variable(): + paddle.nn.functional.pad(x=input_data, pad=[1, 1, 1, 1]) + + self.assertRaises(TypeError, test_Variable) + + data = paddle.static.data( + name='data', shape=[4], dtype='float16' + ) + paddle.nn.functional.pad(x=data, pad=[0, 1]) + + class TestPaddingValueTensor(UnittestBase): + def init_info(self): + self.shapes = [[2, 4]] + 
self.save_path = os.path.join( + self.temp_dir.name, self.path_prefix() + ) + + def test_static(self): + with paddle.fluid.framework._static_guard(): + main_prog = Program() + startup_prog = Program() + with program_guard(main_prog, startup_prog): + fc = paddle.nn.Linear(4, 10) + x = paddle.randn([2, 4]) + x.stop_gradient = False + feat = fc(x) # [2, 10] + + out = self.call_func(feat) + + sgd = paddle.optimizer.SGD() + sgd.minimize(paddle.mean(out)) + self.assertTrue(self.var_prefix() in str(main_prog)) + exe = paddle.static.Executor(paddle.XPUPlace(0)) + exe.run(startup_prog) + res = exe.run(fetch_list=[feat, out]) + gt = np.pad( + res[0], [1, 1], 'constant', constant_values=[1.0, 1.0] + ) + np.testing.assert_allclose(res[1], gt) + paddle.static.save_inference_model( + self.save_path, [x], [feat, out], exe + ) + # Test for Inference Predictor + infer_outs = self.infer_prog() + gt = np.pad( + infer_outs[0], + [1, 1], + 'constant', + constant_values=[1.0, 1.0], + ) + np.testing.assert_allclose(infer_outs[1], gt) + + def path_prefix(self): + return 'padding_value' + + def var_prefix(self): + return "Var[" + + def call_func(self, x): + padding_value = paddle.assign([1.0]) + out = paddle.nn.functional.pad( + x, pad=[1, 1, 1, 1], value=padding_value, mode='constant' + ) + return out + + class TestPaddingValueTensor2(TestPaddingValueTensor): + def call_func(self, x): + padding_value = paddle.assign([1.0]) + # test for int value + tmp = paddle.nn.functional.pad(x, pad=[1, 1, 1, 1], value=1) + out = paddle.nn.functional.pad( + x, pad=[1, 1, 1, 1], value=padding_value + ) + return out + + class TestPaddingValueTensor3(unittest.TestCase): + def test_static(self): + with paddle.fluid.framework._static_guard(): + np_x = np.random.random((16, 16)).astype('float32') + main_prog = Program() + startup_prog = Program() + with program_guard(main_prog, startup_prog): + x = paddle.assign(np_x).astype('float32') + pad_value = paddle.assign([0.0]).astype('float64') + y = paddle.nn.functional.pad( + x, [0, 1, 2, 3], value=pad_value + ) + loss = y.sum() + optimize_ops, params_grads = paddle.optimizer.SGD( + 0.01 + ).minimize(loss) + + exe = paddle.static.Executor(paddle.XPUPlace(0)) + res = exe.run( + main_prog, fetch_list=[y] + [g for p, g in params_grads] + ) + pd_out = res[0] + np_out = np.pad(np_x, [(0, 1), (2, 3)], constant_values=0.0) + np.testing.assert_allclose(pd_out, np_out) + + +support_types = get_xpu_op_support_types("pad") +for stype in support_types: + create_test_class(globals(), XPUTestPadOp, stype) + +if __name__ == "__main__": + unittest.main() diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 8bbe39b3b7659..4da57036b68c8 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -340,10 +340,8 @@ def get_pr_ut(self): file_list.append(filename) else: filterFiles.append(filename) - elif ( - ('/xpu/' in filename.lower()) - or ('/npu/' in filename.lower()) - or ('/ipu/' in filename.lower()) + elif ('/xpu/' in filename.lower()) or ( + '/ipu/' in filename.lower() ): filterFiles.append(filename) else: diff --git a/tools/xpu/check_xpu_dependence.sh b/tools/xpu/check_xpu_dependence.sh new file mode 100644 index 0000000000000..abfea14330819 --- /dev/null +++ b/tools/xpu/check_xpu_dependence.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -u + +if [[ $# -ne 2 ]]; then + echo "usage: ./check_xpu_dependence.sh XPU_BASE_URL XPU_XCCL_BASE_URL" + exit 1 +fi + +xpu_base_url=$1 +xccl_base_url=$2 + +echo "xpu_base_url: $xpu_base_url" +echo "xccl_base_url: $xccl_base_url" + +function check_files() { + local url="$1" + local local_dir="$2" + echo "local dir: $local_dir" + local local_file_name="${local_dir}.tar.gz" + echo "local file name: $local_file_name" + + shift + shift + local files=("$@") + + # start to download + echo "downloading: $url" + rm -f ./$local_file_name + wget -q $url -O ${local_file_name} + if [[ $? -ne 0 ]]; then + echo "downloading failed: $url" + return 1 + else + echo "downloading ok: $url" + fi + + # remove local dir and de-compress + rm -rf ./$local_dir + tar xf $local_file_name + if [[ $? -ne 0 ]]; then + echo "de-compress failed: $local_file_name" + return 1 + fi + + for i in "${files[@]}"; + do + echo "checking $local_dir/$i" + if [[ ! -f $local_dir/$i ]]; then + echo "checking failed: $local_dir/$i" + return 1 + else + echo "checking ok: $local_dir/$i" + fi + done + + # clean + rm -f ./$local_file_name + rm -rf ./$local_dir +} + +# XRE +xre_tar_file_names=("xre-kylin_aarch64" "xre-bdcentos_x86_64" "xre-ubuntu_x86_64" "xre-centos7_x86_64") +xre_inner_file_names=("include/xpu/runtime.h" "so/libxpurt.so") +for name in ${xre_tar_file_names[@]}; do + url="${xpu_base_url}/${name}.tar.gz" + check_files $url $name "${xre_inner_file_names[@]}" + if [[ $? -ne 0 ]]; then + echo "XRE check failed, name: $name" + exit 1 + else + echo "XRE check ok, name: $name" + fi +done + +# XDNN +xdnn_tar_file_names=("xdnn-kylin_aarch64" "xdnn-bdcentos_x86_64" "xdnn-ubuntu_x86_64" "xdnn-centos7_x86_64") +xdnn_inner_file_names=("include/xpu/xdnn.h" "so/libxpuapi.so") +for name in ${xdnn_tar_file_names[@]}; do + url="${xpu_base_url}/${name}.tar.gz" + check_files $url $name "${xdnn_inner_file_names[@]}" + if [[ $? -ne 0 ]]; then + echo "XDNN check failed, name: $name" + exit 1 + else + echo "XDNN check ok, name: $name" + fi +done + +# XCCL +xccl_tar_file_names=("xccl_rdma-bdcentos_x86_64" "xccl_rdma-ubuntu_x86_64" "xccl_socket-bdcentos_x86_64" "xccl_socket-deepin_sw6_64" "xccl_socket-kylin_aarch64" "xccl_socket-ubuntu_x86_64") +xccl_inner_file_names=("include/bkcl.h" "so/libbkcl.so") +for name in ${xccl_tar_file_names[@]}; do + url="${xccl_base_url}/${name}.tar.gz" + check_files $url $name "${xccl_inner_file_names[@]}" + if [[ $? -ne 0 ]]; then + echo "XCCL check failed, name: $name" + exit 1 + else + echo "XCCL check ok, name: $name" + fi +done + +echo "ALL CHECKS PASSED" + +exit 0
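
Usage note, not part of the patch: the DataLoader docstring above points readers to :code:`paddle.io.IterableDataset` for the multi-process iterable case, and the new __init__ checks reject shuffle and batch_sampler for such datasets. A minimal sketch of that path, assuming only the public paddle.io APIs; the RandomIterableDataset name and the worker sharding via get_worker_info are illustrative, not taken from this patch:

import numpy as np
import paddle
from paddle.io import IterableDataset, DataLoader, get_worker_info

class RandomIterableDataset(IterableDataset):
    # Streams (image, label) pairs; shuffle/batch_sampler must stay unset,
    # so the DataLoader falls back to _InfiniteIterableSampler for batching.
    def __init__(self, num_samples):
        super().__init__()
        self.num_samples = num_samples

    def __iter__(self):
        info = get_worker_info()  # None when num_workers == 0
        wid = 0 if info is None else info.id
        nworkers = 1 if info is None else info.num_workers
        # Each worker yields a disjoint shard to avoid duplicated samples.
        for _ in range(wid, self.num_samples, nworkers):
            image = np.random.random([784]).astype('float32')
            label = np.random.randint(0, 10, (1,)).astype('int64')
            yield image, label

loader = DataLoader(RandomIterableDataset(64), batch_size=16, num_workers=2)
for image, label in loader():
    print(image.shape, label.shape)  # [16, 784], [16, 1]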