From e912978a9a4df6ba9cba28fa56fc9550969e927f Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Fri, 5 Aug 2022 11:19:04 +0000 Subject: [PATCH 1/4] add folding for biasqk2 --- paddle/fluid/framework/var_desc.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 954f766611bcd..5d6ad0a1a5a94 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -79,7 +79,7 @@ void VarDesc::SetShapes( } std::vector VarDesc::GetShape() const { - // VLOG(0)<<"@@@ VarDesc::GetShape()"< VarDesc::GetLoDLevels() const { } const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { - // VLOG(0)<<"@@@ tensor name: "<Name(); + // VLOG(1)<<"@@@ tensor name: "<Name(); PADDLE_ENFORCE_EQ( desc_.has_type(), true, From a98f48cdebe53f9fc2d937ffc85a81e015516f2f Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Wed, 10 Aug 2022 10:55:37 +0000 Subject: [PATCH 2/4] merge develop --- cmake/external/cinn.cmake | 7 +- cmake/external/xpu.cmake | 4 +- cmake/operators.cmake | 2 +- paddle/fluid/distributed/CMakeLists.txt | 1 + .../distributed/auto_parallel/CMakeLists.txt | 37 + .../auto_parallel/auto_parallel.proto | 120 ++ .../distributed/auto_parallel/device_mesh.cc | 398 +++++++ .../distributed/auto_parallel/device_mesh.h | 273 +++++ .../auto_parallel/device_mesh_test.cc | 93 ++ .../distributed/auto_parallel/dist_attr.cc | 533 +++++++++ .../distributed/auto_parallel/dist_attr.h | 239 ++++ .../auto_parallel/dist_attr_test.cc | 142 +++ .../distributed/auto_parallel/dist_mapper.cc | 146 +++ .../distributed/auto_parallel/dist_mapper.h | 73 ++ .../auto_parallel/dist_mapper_test.cc | 72 ++ .../distributed/auto_parallel/process_mesh.cc | 134 +++ .../distributed/auto_parallel/process_mesh.h | 94 ++ .../auto_parallel/process_mesh_test.cc | 53 + .../fluid/distributed/auto_parallel/utils.h | 114 ++ .../collective/ProcessGroupHCCL.cc | 11 - .../collective/ProcessGroupNCCL.cc | 15 +- .../distributed/fleet_executor/dist_model.cc | 20 +- .../distributed/ps/table/depends/dense.h | 6 +- .../distributed/ps/table/sparse_accessor.cc | 7 +- paddle/fluid/distributed/the_one_ps.proto | 2 +- .../final_state_generator/python_c_gen.py | 46 +- paddle/fluid/eager/backward.cc | 21 +- paddle/fluid/eager/nan_inf_utils.cc | 2 + paddle/fluid/eager/nan_inf_utils.h | 21 + paddle/fluid/framework/CMakeLists.txt | 20 +- paddle/fluid/framework/attribute.cc | 4 + paddle/fluid/framework/attribute.h | 31 +- paddle/fluid/framework/attribute_test.cc | 20 + paddle/fluid/framework/block_desc.cc | 68 +- paddle/fluid/framework/block_desc.h | 5 +- .../framework/details/nan_inf_utils_detail.h | 5 +- paddle/fluid/framework/dlpack_tensor_test.cc | 2 + paddle/fluid/framework/framework.proto | 4 + .../inference_cached_ops.h} | 22 +- paddle/fluid/framework/infershape_utils.cc | 9 +- paddle/fluid/framework/ir/CMakeLists.txt | 14 +- .../ir/gpu_cpu_map_matmul_to_mul_pass.cc | 2 +- paddle/fluid/framework/ir/graph.cc | 2 +- .../framework/ir/graph_pattern_detector.cc | 27 + .../framework/ir/graph_pattern_detector.h | 15 + .../conv_elementwise_add_mkldnn_fuse_pass.cc | 7 + .../conv_elementwise_add_mkldnn_fuse_pass.h | 3 + .../framework/ir/mkldnn/cpu_bfloat16_pass.cc | 7 +- .../matmul_activation_mkldnn_fuse_pass.cc | 33 +- ...matmul_elementwise_add_mkldnn_fuse_pass.cc | 157 +++ ...matmul_elementwise_add_mkldnn_fuse_pass.h} | 23 +- ...mul_transpose_reshape_mkldnn_fuse_pass.cc} | 183 +-- ...tmul_transpose_reshape_mkldnn_fuse_pass.h} | 11 +- 
...nspose_reshape_mkldnn_fuse_pass_tester.cc} | 7 +- .../ir/mkldnn/mkldnn_inplace_pass_tester.cc | 5 +- ...shape_transpose_matmul_mkldnn_fuse_pass.cc | 183 +-- ...eshape_transpose_matmul_mkldnn_fuse_pass.h | 14 +- ...ranspose_matmul_mkldnn_fuse_pass_tester.cc | 6 +- .../framework/ir/swin_attention1_fuse_pass.cc | 8 + .../framework/new_executor/CMakeLists.txt | 7 +- .../garbage_collector/CMakeLists.txt | 21 +- .../event_garbage_collector.cc | 103 +- .../event_garbage_collector.h | 23 +- .../fast_garbage_collector.cc | 9 +- .../fast_garbage_collector.h | 10 +- .../garbage_collector/garbage_collector.cc | 33 +- .../garbage_collector/garbage_collector.h | 15 +- .../no_event_garbage_collector.cc | 106 ++ .../no_event_garbage_collector.h | 39 + .../framework/new_executor/interpretercore.cc | 71 +- .../framework/new_executor/interpretercore.h | 1 - paddle/fluid/framework/op_desc.cc | 179 ++- paddle/fluid/framework/op_desc.h | 40 +- paddle/fluid/framework/op_proto_maker.h | 4 +- paddle/fluid/framework/operator.cc | 190 ++- paddle/fluid/framework/operator.h | 1 + .../framework/paddle2cinn/build_cinn_pass.cc | 16 +- paddle/fluid/framework/program_desc.cc | 45 +- paddle/fluid/framework/program_desc.h | 2 + paddle/fluid/framework/prune.cc | 36 +- paddle/fluid/framework/tensor.h | 50 +- paddle/fluid/framework/tensor_test.cc | 1 + paddle/fluid/framework/tensor_util.h | 21 + paddle/fluid/framework/type_defs.h | 7 +- paddle/fluid/framework/var_desc.h | 45 +- paddle/fluid/imperative/reducer.cc | 2 +- .../fluid/imperative/tests/test_prepare_op.cc | 2 +- paddle/fluid/inference/analysis/argument.h | 2 +- .../inference/analysis/ir_pass_manager.cc | 3 +- .../ir_passes/tensorrt_subgraph_pass.cc | 5 +- .../passes/convert_to_mixed_precision.cc | 530 +++++---- paddle/fluid/inference/api/analysis_config.cc | 6 +- .../fluid/inference/api/analysis_predictor.cc | 2 + .../inference/api/paddle_analysis_config.h | 4 +- .../inference/api/paddle_pass_builder.cc | 18 +- paddle/fluid/inference/capi/paddle_c_api.h | 2 +- paddle/fluid/inference/capi/pd_config.cc | 2 +- paddle/fluid/inference/capi_exp/pd_config.cc | 2 +- paddle/fluid/inference/capi_exp/pd_config.h | 2 +- .../inference/tensorrt/convert/CMakeLists.txt | 2 + .../fill_constant_batch_size_like_op.cc | 86 ++ .../tensorrt/convert/multihead_matmul_op.cc | 4 + .../inference/tensorrt/convert/rnn_op.cc | 320 ++++++ .../inference/tensorrt/convert/ut_helper.h | 2 +- paddle/fluid/inference/tensorrt/engine.h | 6 +- paddle/fluid/inference/tensorrt/op_teller.cc | 61 + .../tensorrt/plugin/gather_nd_op_plugin.cu | 1 + .../tensorrt/plugin/qkv_to_context_plugin.cu | 69 ++ .../tests/api/analyzer_rnn1_tester.cc | 2 - paddle/fluid/jit/CMakeLists.txt | 16 +- paddle/fluid/jit/all.h | 8 +- paddle/fluid/jit/ast.h | 59 - paddle/fluid/jit/compilation_unit.cc | 28 +- paddle/fluid/jit/compilation_unit.h | 17 +- paddle/fluid/jit/engine/CMakeLists.txt | 9 + .../{base_function.h => engine/base_engine.h} | 4 +- paddle/fluid/jit/engine/executor_engine.cc | 63 + paddle/fluid/jit/engine/executor_engine.h | 51 + paddle/fluid/jit/engine/pe_engine.cc | 118 ++ paddle/fluid/jit/engine/pe_engine.h | 67 ++ paddle/fluid/jit/executor_function.h | 80 -- paddle/fluid/jit/function.cc | 43 + .../function.h} | 33 +- paddle/fluid/jit/function_utils.cc | 11 +- paddle/fluid/jit/function_utils.h | 11 +- paddle/fluid/jit/layer.cc | 54 +- paddle/fluid/jit/layer.h | 32 +- paddle/fluid/jit/layer_test.cc | 5 +- paddle/fluid/jit/object.h | 66 -- paddle/fluid/jit/pe_function.h | 144 --- paddle/fluid/jit/serializer.cc | 50 +- 
paddle/fluid/jit/serializer.h | 7 +- paddle/fluid/operators/CMakeLists.txt | 18 +- paddle/fluid/operators/activation_op_mlu.cc | 8 +- paddle/fluid/operators/batch_norm_op_mlu.cc | 2 +- .../fluid/operators/class_center_sample_op.cc | 40 +- .../fluid/operators/class_center_sample_op.cu | 611 ---------- .../fluid/operators/class_center_sample_op.h | 119 -- .../fluid/operators/conv_transpose_op_mlu.cc | 32 +- .../fluid/operators/detection/CMakeLists.txt | 7 +- .../detection/distribute_fpn_proposals_op.cc | 2 +- .../detection/generate_proposals_op.cc | 5 +- .../detection/generate_proposals_v2_op.cc | 247 +--- .../detection/generate_proposals_v2_op.cu | 277 ----- .../detection/locality_aware_nms_op.cc | 31 +- .../operators/detection/matrix_nms_op.cc | 4 +- .../operators/detection/multiclass_nms_op.cc | 22 +- .../fluid/operators/detection/prior_box_op.cu | 195 ---- paddle/fluid/operators/erfinv_op.cc | 89 -- paddle/fluid/operators/fill_any_op.cc | 56 +- paddle/fluid/operators/fill_any_op.cu.cc | 34 - paddle/fluid/operators/fill_any_op.h | 67 -- .../operators/fill_diagonal_tensor_op.cc | 221 +--- .../operators/fill_diagonal_tensor_op.cu | 231 ---- paddle/fluid/operators/fold_op.cc | 253 +--- paddle/fluid/operators/fold_op.cu | 25 - paddle/fluid/operators/fold_op.h | 140 --- .../fused/fused_multi_transformer_op.cu | 102 +- .../operators/fused/fused_softmax_mask.cu.h | 12 +- .../operators/fused/multihead_matmul_op.cu | 42 +- .../fluid/operators/gather_scatter_kernel.h | 2 + .../operators/graph_sample_neighbors_op.cc | 2 +- paddle/fluid/operators/graph_send_recv_op.cc | 12 +- .../operators/margin_cross_entropy_op.cc | 99 +- .../operators/margin_cross_entropy_op.cu | 618 ---------- .../fluid/operators/margin_cross_entropy_op.h | 40 - .../operators/math/bert_encoder_functor.cu | 3 +- paddle/fluid/operators/math/cross_entropy.h | 1 + paddle/fluid/operators/math/sample_prob.h | 1 + paddle/fluid/operators/math/softmax_impl.h | 4 + paddle/fluid/operators/math/unpooling.h | 1 + paddle/fluid/operators/math/vol2col.cc | 34 +- .../operators/mkldnn/activation_mkldnn_op.cc | 100 +- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 92 +- .../mkldnn/conv_transpose_mkldnn_op.cc | 21 +- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 336 +++--- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 20 +- .../operators/mkldnn/test_mkldnn_caching.cc | 3 +- .../mkldnn/test_mkldnn_op_inplace.cc | 2 +- .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 2 +- .../distributed_fused_lamb_init_op.cu | 5 +- .../optimizers/distributed_fused_lamb_op.cu | 110 +- paddle/fluid/operators/optimizers/lamb_op.cc | 145 +-- paddle/fluid/operators/optimizers/lamb_op.h | 813 ------------- .../fluid/operators/optimizers/lamb_op_xpu.cc | 4 +- .../optimizers/merged_momentum_op_xpu.cc | 141 +++ paddle/fluid/operators/pool_op_xpu.cc | 30 +- .../fluid/operators/prim_ops/CMakeLists.txt | 3 +- paddle/fluid/operators/prim_ops/log_p_op.cc | 75 ++ .../fluid/operators/prim_ops/prim_op_test.cc | 20 + paddle/fluid/operators/prim_ops/split_p_op.cc | 2 +- paddle/fluid/operators/spectral_helper.h | 545 --------- paddle/fluid/operators/spectral_op.cc | 389 ------- paddle/fluid/operators/spectral_op.cu | 38 - paddle/fluid/operators/spectral_op.cu.h | 1018 ----------------- paddle/fluid/operators/spectral_op.h | 507 -------- paddle/fluid/operators/stft_op.cc | 2 - paddle/fluid/operators/stft_op.cu | 1 - paddle/fluid/operators/stft_op.h | 46 +- paddle/fluid/operators/tdm_child_op.cc | 2 +- .../operators/tensorrt/tensorrt_engine_op.cc | 2 +- 
.../operators/tensorrt/tensorrt_engine_op.h | 4 +- .../tensorrt/tensorrt_engine_op_test.cc | 4 +- paddle/fluid/operators/tile_op.cc | 36 - paddle/fluid/operators/transpose_op.cc | 2 +- paddle/fluid/operators/unpool_op.cc | 141 +-- paddle/fluid/operators/unpool_op.cu.cc | 29 - paddle/fluid/operators/unpool_op.h | 123 -- paddle/fluid/platform/CMakeLists.txt | 9 - .../fluid/platform/device/xpu/enforce_xpu.h | 10 +- .../device/xpu/tests/enforce_xpu_test.cc | 2 + .../fluid/platform/device/xpu/xpu2_op_list.h | 45 + paddle/fluid/platform/device_context.h | 4 +- paddle/fluid/platform/device_event.h | 6 - paddle/fluid/platform/device_event_base.h | 4 +- paddle/fluid/platform/device_event_xpu.cc | 126 -- paddle/fluid/platform/event.h | 2 + paddle/fluid/platform/flags.cc | 4 +- paddle/fluid/platform/mkldnn_reuse.h | 293 ++--- paddle/fluid/pybind/eager_functions.cc | 19 +- paddle/fluid/pybind/eager_utils.cc | 25 +- paddle/fluid/pybind/eager_utils.h | 6 +- paddle/fluid/pybind/imperative.cc | 2 +- paddle/fluid/pybind/inference_api.cc | 2 +- paddle/fluid/pybind/jit.cc | 26 +- paddle/fluid/pybind/op_function_common.cc | 3 +- paddle/fluid/pybind/op_function_generator.h | 1 + paddle/fluid/pybind/protobuf.cc | 59 +- paddle/fluid/pybind/pybind.cc | 134 ++- paddle/fluid/pybind/slice_utils.h | 4 +- paddle/fluid/pybind/tensor_py.h | 6 +- paddle/infrt/common/type.h | 62 +- paddle/phi/api/lib/api_custom_impl.cc | 6 +- paddle/phi/api/lib/data_transform.cc | 2 - paddle/phi/api/lib/tensor.cc | 6 +- paddle/phi/api/lib/tensor_method.cc | 11 +- paddle/phi/api/yaml/api.yaml | 39 +- paddle/phi/api/yaml/api_compat.yaml | 27 + paddle/phi/api/yaml/backward.yaml | 42 + paddle/phi/api/yaml/generator/api_gen.py | 2 +- paddle/phi/api/yaml/generator/filters.py | 4 +- paddle/phi/api/yaml/generator/generate_op.py | 1 + paddle/phi/api/yaml/generator/tests.py | 2 +- .../yaml/generator/wrapped_infermeta_gen.py | 6 + paddle/phi/api/yaml/legacy_api.yaml | 120 +- paddle/phi/api/yaml/legacy_backward.yaml | 87 +- paddle/phi/backends/gpu/cuda/cuda_info.cc | 1 + paddle/phi/backends/gpu/rocm/rocm_info.cc | 10 +- paddle/phi/backends/xpu/enforce_xpu.h | 8 + paddle/phi/common/backend.h | 3 +- paddle/phi/common/layout.h | 19 +- paddle/phi/core/compat/op_utils.h | 1 + paddle/phi/core/dense_tensor.h | 32 +- paddle/phi/core/enforce.cc | 5 +- paddle/phi/core/tensor_utils.cc | 13 + paddle/phi/core/utils/data_type.h | 14 + paddle/phi/core/visit_type.h | 14 + paddle/phi/infermeta/backward.cc | 77 ++ paddle/phi/infermeta/backward.h | 28 + paddle/phi/infermeta/binary.cc | 159 ++- paddle/phi/infermeta/binary.h | 45 +- paddle/phi/infermeta/multiary.cc | 117 ++ paddle/phi/infermeta/multiary.h | 36 + paddle/phi/infermeta/ternary.cc | 20 +- paddle/phi/infermeta/ternary.h | 3 +- paddle/phi/infermeta/unary.cc | 348 +++++- paddle/phi/infermeta/unary.h | 42 + paddle/phi/kernels/CMakeLists.txt | 52 +- paddle/phi/kernels/affine_grid_grad_kernel.h | 1 - paddle/phi/kernels/affine_grid_kernel.h | 1 - paddle/phi/kernels/assign_kernel.h | 11 + .../phi/kernels/class_center_sample_kernel.h | 33 + .../kernels/cpu/affine_grid_grad_kernel.cc | 2 + paddle/phi/kernels/cpu/affine_grid_kernel.cc | 2 + .../kernels/cpu/class_center_sample_kernel.cc | 122 ++ paddle/phi/kernels/cpu/compare_kernel.cc | 18 +- paddle/phi/kernels/cpu/dropout_grad_kernel.cc | 17 +- paddle/phi/kernels/cpu/dropout_kernel.cc | 12 +- paddle/phi/kernels/cpu/fft_grad_kernel.cc | 32 + paddle/phi/kernels/cpu/fft_kernel.cc | 32 + .../kernels/cpu/fill_diagonal_grad_kernel.cc | 3 +- 
.../phi/kernels/cpu/fill_diagonal_kernel.cc | 3 +- .../cpu/fill_diagonal_tensor_grad_kernel.cc | 76 ++ .../cpu/fill_diagonal_tensor_kernel.cc | 140 +++ paddle/phi/kernels/cpu/fill_grad_kernel.cc | 29 + paddle/phi/kernels/cpu/fill_kernel.cc | 29 + paddle/phi/kernels/cpu/fold_grad_kernel.cc | 22 + paddle/phi/kernels/cpu/fold_kernel.cc | 21 + .../cpu/generate_proposals_v2_kernel.cc | 392 +++++++ .../phi/kernels/cpu/graph_send_recv_kernel.cc | 43 +- paddle/phi/kernels/cpu/lamb_kernel.cc | 20 + .../cpu/margin_cross_entropy_kernel.cc | 50 + .../phi/kernels/cpu/multiclass_nms3_kernel.cc | 1 - paddle/phi/kernels/cpu/scale_kernel.cc | 4 +- paddle/phi/kernels/cpu/unpool_grad_kernel.cc | 137 +++ paddle/phi/kernels/cpu/unpool_kernel.cc | 132 +++ paddle/phi/kernels/dropout_grad_kernel.h | 4 +- paddle/phi/kernels/dropout_kernel.h | 4 +- paddle/phi/kernels/empty_kernel.cc | 16 + paddle/phi/kernels/fft_grad_kernel.h | 48 + paddle/phi/kernels/fft_kernel.h | 47 + .../phi/kernels/fill_diagonal_grad_kernel.h | 2 - paddle/phi/kernels/fill_diagonal_kernel.h | 3 - .../fill_diagonal_tensor_grad_kernel.h | 37 + .../phi/kernels/fill_diagonal_tensor_kernel.h | 38 + .../fill_grad_kernel.h} | 15 +- ...l_diagonal_kernel_impl.h => fill_kernel.h} | 16 +- paddle/phi/kernels/fold_grad_kernel.h | 31 + paddle/phi/kernels/fold_kernel.h | 30 + paddle/phi/kernels/funcs/CMakeLists.txt | 17 + paddle/phi/kernels/funcs/activation_functor.h | 4 + .../affine_grid_utils.h} | 1 - paddle/phi/kernels/funcs/common_shape.h | 11 + paddle/phi/kernels/funcs/compare_functors.h | 5 + paddle/phi/kernels/funcs/cufft_util.h | 160 +++ .../kernels/funcs}/detection/nms_util.h | 40 +- paddle/phi/kernels/funcs/fft.cc | 378 ++++++ paddle/phi/kernels/funcs/fft.cu | 346 ++++++ paddle/phi/kernels/funcs/fft.h | 103 ++ paddle/phi/kernels/funcs/fft_cache.h | 189 +++ paddle/phi/kernels/funcs/fft_fill_conj.h | 219 ++++ paddle/phi/kernels/funcs/fft_key.h | 115 ++ paddle/phi/kernels/funcs/hipfft_util.h | 184 +++ paddle/phi/kernels/funcs/lamb_functors.h | 463 ++++++++ paddle/phi/kernels/funcs/mkl_fft_utils.h | 172 +++ .../phi/kernels/funcs/onednn/mkldnn_reuse.h | 301 +++++ paddle/phi/kernels/funcs/sparse/scatter.cu.h | 9 +- .../funcs/sparse/sparse_blas_impl.cu.h | 4 +- .../kernels/funcs}/tensor_to_string.h | 37 +- .../kernels/generate_proposals_v2_kernel.h | 38 + .../kernels/gpu/affine_grid_grad_kernel.cu | 2 + paddle/phi/kernels/gpu/affine_grid_kernel.cu | 2 + paddle/phi/kernels/gpu/batch_norm_kernel.cu | 89 +- .../kernels/gpu/class_center_sample_kernel.cu | 598 ++++++++++ paddle/phi/kernels/gpu/dropout_grad_kernel.cu | 24 +- paddle/phi/kernels/gpu/dropout_kernel.cu | 8 +- paddle/phi/kernels/gpu/fft_grad_kernel.cu | 32 + paddle/phi/kernels/gpu/fft_kernel.cu | 32 + .../kernels/gpu/fill_diagonal_grad_kernel.cu | 3 +- .../phi/kernels/gpu/fill_diagonal_kernel.cu | 3 +- .../gpu/fill_diagonal_tensor_grad_kernel.cu | 114 ++ .../gpu/fill_diagonal_tensor_kernel.cu | 136 +++ paddle/phi/kernels/gpu/fill_grad_kernel.cu | 30 + paddle/phi/kernels/gpu/fill_kernel.cu | 30 + paddle/phi/kernels/gpu/fold_grad_kernel.cu | 22 + paddle/phi/kernels/gpu/fold_kernel.cu | 21 + .../gpu/generate_proposals_v2_kernel.cu | 589 ++++++++++ .../gpu/graph_sample_neighbors_kernel.cu | 33 +- .../phi/kernels/gpu/graph_send_recv_funcs.h | 2 +- .../phi/kernels/gpu/graph_send_recv_kernel.cu | 51 +- paddle/phi/kernels/gpu/lamb_kernel.cu | 30 + .../gpu/margin_cross_entropy_grad_kernel.cu | 243 ++++ .../gpu/margin_cross_entropy_kernel.cu | 483 ++++++++ .../phi/kernels/gpu/reduce_amin_amax_common.h | 3 + 
.../phi/kernels/gpu/reduce_sum_grad_kernel.cu | 3 + paddle/phi/kernels/gpu/scale_kernel.cu | 4 +- .../phi/kernels/gpu/sync_batch_norm_kernel.cu | 26 +- paddle/phi/kernels/gpu/unpool_grad_kernel.cu | 203 ++++ paddle/phi/kernels/gpu/unpool_kernel.cu | 188 +++ paddle/phi/kernels/graph_send_recv_kernel.h | 3 +- .../phi/kernels/impl/fft_grad_kernel_impl.h | 110 ++ paddle/phi/kernels/impl/fft_kernel_impl.h | 83 ++ .../phi/kernels/impl/fill_grad_kernel_impl.h | 38 + paddle/phi/kernels/impl/fill_kernel_impl.h | 43 + .../phi/kernels/impl/fold_grad_kernel_impl.h | 75 ++ paddle/phi/kernels/impl/fold_kernel_impl.h | 77 ++ paddle/phi/kernels/impl/lamb_kernel_impl.h | 296 +++++ paddle/phi/kernels/impl/reduce_grad.h | 3 + paddle/phi/kernels/kps/compare_kernel.cu | 18 +- paddle/phi/kernels/lamb_kernel.h | 44 + .../margin_cross_entropy_grad_kernel.h | 34 + .../phi/kernels/margin_cross_entropy_kernel.h | 35 + .../kernels/onednn/activation_grad_kernel.cc | 251 ++++ .../phi/kernels/onednn/activation_kernel.cc | 170 +++ paddle/phi/kernels/reduce_amax_kernel.cc | 3 + paddle/phi/kernels/reduce_amin_kernel.cc | 3 + paddle/phi/kernels/reduce_max_kernel.cc | 3 + paddle/phi/kernels/reduce_min_kernel.cc | 3 + paddle/phi/kernels/reduce_sum_kernel.cc | 3 + .../kernels/selected_rows/cpu/lamb_kernel.cc | 21 + .../kernels/selected_rows/gpu/lamb_kernel.cu | 30 + .../selected_rows/impl/lamb_kernel_impl.h | 351 ++++++ .../phi/kernels/selected_rows/lamb_kernel.h | 46 + paddle/phi/kernels/shape_kernel.cc | 4 +- .../phi/kernels/sparse/gpu/coalesce_kernel.cu | 2 +- paddle/phi/kernels/sparse/gpu/conv.cu.h | 147 ++- .../kernels/sparse/gpu/conv_grad_kernel.cu | 42 +- paddle/phi/kernels/sparse/gpu/conv_kernel.cu | 27 +- paddle/phi/kernels/sparse/gpu/mask_kernel.cu | 4 +- .../phi/kernels/sparse/gpu/mv_grad_kernel.cu | 4 +- .../kernels/sparse/gpu/pool_grad_kernel.cu | 2 +- paddle/phi/kernels/sparse/gpu/pool_kernel.cu | 2 +- .../kernels/sparse/gpu/softmax_grad_kernel.cu | 2 +- .../phi/kernels/sparse/gpu/softmax_kernel.cu | 18 +- .../kernels/sparse/gpu/sparse_utils_kernel.cu | 6 +- paddle/phi/kernels/unpool_grad_kernel.h | 47 + paddle/phi/kernels/unpool_kernel.h | 43 + .../phi/ops/compat/class_center_sample_sig.cc | 36 + paddle/phi/ops/compat/einsum_sig.cc | 12 +- .../ops/compat/fill_diagonal_tensor_sig.cc | 38 + paddle/phi/ops/compat/fill_sig.cc | 33 + .../lamb_op.cu => phi/ops/compat/fold_sig.cc} | 26 +- paddle/phi/ops/compat/graph_send_recv_sig.cc | 15 +- paddle/phi/ops/compat/lamb_sig.cc | 62 + .../ops/compat/margin_cross_entropy_sig.cc | 54 + paddle/phi/ops/compat/squeeze_sig.cc | 8 +- paddle/phi/ops/compat/unpool3d_sig.cc | 37 + paddle/phi/ops/compat/unpool_sig.cc | 36 + paddle/phi/ops/compat/unsqueeze_sig.cc | 37 +- python/paddle/__init__.py | 1 + .../auto_parallel/operators/dist_matmul.py | 538 ++++++++- python/paddle/distributed/collective.py | 4 + python/paddle/distributed/fleet/__init__.py | 5 + .../distributed/fleet/base/fleet_base.py | 123 +- .../parallel_layers/mp_layers.py | 58 +- .../meta_parallel/parallel_layers/random.py | 14 +- python/paddle/distributed/fleet/utils/fs.py | 28 +- .../distributed/launch/context/__init__.py | 5 +- .../distributed/launch/context/args_envs.py | 5 +- .../distributed/launch/context/device.py | 8 +- .../launch/controllers/collective.py | 4 +- .../launch/controllers/controller.py | 19 +- .../distributed/launch/controllers/ps.py | 16 +- .../distributed/launch/job/container.py | 13 +- .../distributed/launch/plugins/__init__.py | 9 +- python/paddle/distributed/ps/coordinator.py | 9 +- 
python/paddle/distributed/ps/the_one_ps.py | 148 ++- .../distributed/sharding/group_sharded.py | 2 +- python/paddle/distribution/transform.py | 4 +- python/paddle/fft.py | 32 +- .../quantization/quant2_int8_mkldnn_pass.py | 6 +- python/paddle/fluid/dataloader/dataset.py | 6 +- python/paddle/fluid/dygraph/base.py | 5 +- .../fluid/dygraph/dygraph_to_static/error.py | 9 +- .../dygraph_to_static/partial_program.py | 70 ++ python/paddle/fluid/dygraph/layers.py | 20 + python/paddle/fluid/executor.py | 14 +- python/paddle/fluid/framework.py | 63 +- python/paddle/fluid/initializer.py | 10 +- python/paddle/fluid/io.py | 44 +- python/paddle/fluid/layers/loss.py | 8 +- python/paddle/fluid/layers/nn.py | 32 +- python/paddle/fluid/layers/sequence_lod.py | 93 +- python/paddle/fluid/layers/tensor.py | 4 +- python/paddle/fluid/metrics.py | 2 +- .../custom_kernel/test_custom_kernel_dot.py | 14 +- .../custom_kernel/test_custom_kernel_load.py | 7 +- .../tests/custom_op/test_context_pool.py | 2 +- .../tests/custom_op/test_custom_attrs_jit.py | 4 +- .../tests/custom_op/test_custom_concat.py | 7 +- .../fluid/tests/custom_op/test_custom_conj.py | 7 +- .../tests/custom_op/test_custom_linear.py | 7 +- .../custom_op/test_custom_raw_op_kernel_op.py | 2 +- .../tests/custom_op/test_custom_relu_model.py | 36 +- .../custom_op/test_custom_relu_op_jit.py | 27 +- .../custom_op/test_custom_relu_op_setup.py | 68 +- .../custom_op/test_custom_simple_slice.py | 8 +- .../tests/custom_op/test_dispatch_jit.py | 7 +- .../tests/custom_op/test_multi_out_jit.py | 10 +- .../fluid/tests/custom_runtime/CMakeLists.txt | 2 +- .../custom_runtime/test_custom_cpu_plugin.py | 10 +- python/paddle/fluid/tests/test_detection.py | 14 +- python/paddle/fluid/tests/test_lod_tensor.py | 22 +- .../auto_parallel/test_dist_op_cost.py | 209 ++++ .../autograd/test_jvp_and_transpose.py | 29 + .../unittests/autograd/test_orig2prim.py | 20 + .../unittests/autograd/test_prim2orig.py | 20 + .../tests/unittests/autograd/test_primapi.py | 2 + .../tests/unittests/autograd/test_primops.py | 1 + .../fluid/tests/unittests/dist_fleet_ctr.py | 27 +- .../tests/unittests/dist_fleet_ctr_ps_gpu.py | 6 +- .../test_distribution_beta_static.py | 1 + .../test_distribution_categorical.py | 2 + .../test_distribution_constraint.py | 2 + .../test_distribution_dirichlet.py | 2 + .../test_distribution_dirichlet_static.py | 1 + .../test_distribution_expfamily.py | 2 + .../test_distribution_expfamily_static.py | 1 + .../test_distribution_independent.py | 2 + .../test_distribution_independent_static.py | 1 + .../distribution/test_distribution_normal.py | 2 + .../test_distribution_transform.py | 3 + .../test_distribution_transform_static.py | 2 + .../distribution/test_distribution_uniform.py | 2 + .../test_distribution_variable.py | 2 + .../tests/unittests/distribution/test_kl.py | 2 + .../unittests/distribution/test_kl_static.py | 2 + .../unittests/dygraph_group_sharded_api.py | 53 +- .../dygraph_group_sharded_api_eager.py | 48 +- .../test_convert_operators.py | 2 +- .../dygraph_to_static/test_deepcopy.py | 4 +- .../unittests/dygraph_to_static/test_error.py | 14 + .../test_gradient_aggregation.py | 60 + .../dygraph_to_static/test_partial_program.py | 2 +- .../dygraph_to_static/test_rollback.py | 8 +- .../unittests/dygraph_to_static/test_slice.py | 25 +- .../tests/unittests/fft/spectral_op_np.py | 127 +- .../fluid/tests/unittests/fft/test_fft.py | 6 +- .../tests/unittests/fft/test_spectral_op.py | 85 +- .../test_standalone_controlflow.py | 4 +- 
.../interpreter/test_standalone_executor.py | 6 +- ...ul_elementwise_add_activation_fuse_pass.py | 132 +++ ...mkldnn_matmul_elementwise_add_fuse_pass.py | 86 ++ ...ldnn_matmul_transpose_reshape_fuse_pass.py | 4 +- ...t_mkldnn_matmul_v2_activation_fuse_pass.py | 131 +++ ...dnn_matmul_v2_elementwise_add_fuse_pass.py | 101 ++ ...n_matmul_v2_transpose_reshape_fuse_pass.py | 4 +- ...n_reshape_transpose_matmul_v2_fuse_pass.py | 2 +- .../ir/inference/test_trt_convert_rnn.py | 253 ++++ .../tests/unittests/mlu/test_adam_op_mlu.py | 4 +- .../tests/unittests/mlu/test_adamw_op_mlu.py | 4 +- .../unittests/mlu/test_batch_norm_op_mlu.py | 9 +- .../mlu/test_batch_norm_op_mlu_v2.py | 6 +- .../tests/unittests/mlu/test_bce_loss_mlu.py | 30 +- .../mlu/test_bce_with_logits_loss_mlu.py | 40 +- .../mlu/test_bilinear_interp_v2_op_mlu.py | 10 +- .../mlu/test_collective_api_base_mlu.py | 22 +- .../unittests/mlu/test_collective_base_mlu.py | 64 +- .../unittests/mlu/test_dropout_op_mlu.py | 4 +- .../mlu/test_elementwise_max_op_mlu.py | 4 +- .../mlu/test_elementwise_min_op_mlu.py | 4 +- .../unittests/mlu/test_expand_v2_op_mlu.py | 2 +- .../mlu/test_fill_constant_op_mlu.py | 2 +- .../unittests/mlu/test_gather_nd_op_mlu.py | 2 +- .../tests/unittests/mlu/test_gather_op_mlu.py | 4 +- .../mlu/test_gaussian_random_op_mlu.py | 8 +- .../tests/unittests/mlu/test_gelu_op_mlu.py | 4 +- .../unittests/mlu/test_hard_sigmoid_op_mlu.py | 9 +- .../unittests/mlu/test_layer_norm_op_mlu.py | 12 +- .../unittests/mlu/test_leaky_relu_op_mlu.py | 4 +- .../tests/unittests/mlu/test_log_op_mlu.py | 8 +- .../unittests/mlu/test_log_softmax_op_mlu.py | 11 +- .../mlu/test_merged_momentum_op_mlu.py | 6 +- .../mlu/test_nearest_interp_v2_op_mlu.py | 6 +- .../tests/unittests/mlu/test_pool2d_op_mlu.py | 6 +- .../unittests/mlu/test_randperm_op_mlu.py | 24 +- .../tests/unittests/mlu/test_relu6_op_mlu.py | 4 +- .../tests/unittests/mlu/test_relu_op_mlu.py | 4 +- .../tests/unittests/mlu/test_scale_op_mlu.py | 4 +- .../unittests/mlu/test_scatter_op_mlu.py | 2 +- .../tests/unittests/mlu/test_slice_op_mlu.py | 8 +- .../test_softmax_with_cross_entropy_op_mlu.py | 4 +- .../tests/unittests/mlu/test_split_op_mlu.py | 20 +- .../mlu/test_squared_l2_norm_op_mlu.py | 2 +- .../tests/unittests/mlu/test_stack_op_mlu.py | 6 +- .../tests/unittests/mlu/test_tanh_op_mlu.py | 4 +- .../unittests/mlu/test_top_k_v2_op_mlu.py | 52 +- .../unittests/mlu/test_transpose_op_mlu.py | 8 +- .../unittests/mlu/test_tril_triu_op_mlu.py | 8 +- .../mlu/test_uniform_random_op_mlu.py | 14 +- .../tests/unittests/mlu/test_where_op_mlu.py | 6 +- .../tests/unittests/npu/test_adam_op_npu.py | 8 +- .../tests/unittests/npu/test_adamw_op_npu.py | 4 +- .../test_amp_check_finite_and_scale_op_npu.py | 4 +- .../unittests/npu/test_assign_value_op_npu.py | 26 +- .../unittests/npu/test_batch_norm_op_npu.py | 9 +- .../tests/unittests/npu/test_bce_loss_npu.py | 32 +- .../npu/test_beam_search_decode_op_npu.py | 5 +- .../tests/unittests/npu/test_clip_op_npu.py | 22 +- .../unittests/npu/test_collective_base_npu.py | 4 +- .../tests/unittests/npu/test_concat_op_npu.py | 6 +- .../tests/unittests/npu/test_cos_op_npu.py | 4 +- .../tests/unittests/npu/test_cumsum_op_npu.py | 16 +- .../unittests/npu/test_dropout_op_npu.py | 4 +- .../npu/test_elementwise_div_op_npu.py | 4 +- .../npu/test_elementwise_max_op_npu.py | 4 +- .../npu/test_elementwise_min_op_npu.py | 4 +- .../npu/test_elementwise_pow_op_npu.py | 4 +- .../npu/test_elementwise_sub_op_npu.py | 4 +- .../tests/unittests/npu/test_expand_op_npu.py | 2 +- 
.../unittests/npu/test_gather_nd_op_npu.py | 2 +- .../tests/unittests/npu/test_gather_op_npu.py | 8 +- .../npu/test_gaussian_random_op_npu.py | 8 +- .../tests/unittests/npu/test_gelu_op_npu.py | 4 +- .../unittests/npu/test_hard_sigmoid_op_npu.py | 8 +- .../unittests/npu/test_hard_swish_op_npu.py | 24 +- .../unittests/npu/test_index_select_op_npu.py | 8 +- .../unittests/npu/test_kldiv_loss_op_npu.py | 2 +- .../unittests/npu/test_layer_norm_op_npu.py | 9 +- .../unittests/npu/test_leaky_relu_op_npu.py | 4 +- .../tests/unittests/npu/test_log_op_npu.py | 4 +- .../unittests/npu/test_log_softmax_op_npu.py | 8 +- .../tests/unittests/npu/test_memcpy_op_npu.py | 8 +- .../npu/test_merged_momentum_op_npu.py | 6 +- .../unittests/npu/test_meshgrid_op_npu.py | 24 +- .../tests/unittests/npu/test_mul_op_npu.py | 20 +- .../unittests/npu/test_multinomial_op_npu.py | 36 +- .../npu/test_nearest_interp_op_npu.py | 6 +- .../npu/test_nearest_interp_v2_op_npu.py | 2 +- .../tests/unittests/npu/test_pad3d_op_npu.py | 28 +- .../tests/unittests/npu/test_pow_op_npu.py | 4 +- .../unittests/npu/test_reduce_sum_op_npu.py | 4 +- .../tests/unittests/npu/test_relu6_op_npu.py | 4 +- .../tests/unittests/npu/test_relu_op_npu.py | 4 +- .../unittests/npu/test_rmsprop_op_npu.py | 8 +- .../tests/unittests/npu/test_sgd_op_npu.py | 4 +- .../tests/unittests/npu/test_slice_op_npu.py | 4 +- .../unittests/npu/test_softmax_op_npu.py | 4 +- .../test_softmax_with_cross_entropy_op_npu.py | 4 +- .../tests/unittests/npu/test_split_op_npu.py | 20 +- .../tests/unittests/npu/test_sqrt_op_npu.py | 4 +- .../tests/unittests/npu/test_square_op_npu.py | 4 +- .../unittests/npu/test_squeeze_op_npu.py | 12 +- .../tests/unittests/npu/test_stack_op_npu.py | 16 +- .../tests/unittests/npu/test_tanh_op_npu.py | 4 +- .../unittests/npu/test_top_k_v2_op_npu.py | 52 +- .../unittests/npu/test_tril_triu_op_npu.py | 8 +- .../test_truncated_gaussian_random_op_npu.py | 2 +- .../npu/test_uniform_random_op_npu.py | 14 +- .../paddle/fluid/tests/unittests/op_test.py | 14 +- .../unittests/test_activation_sparse_op.py | 2 +- .../fluid/tests/unittests/test_assign_op.py | 8 +- .../tests/unittests/test_assign_value_op.py | 24 +- .../tests/unittests/test_attribute_var.py | 158 +++ .../test_auto_parallel_reshard_dpmppp.py | 8 + .../test_auto_parallel_reshard_mppp.py | 10 +- .../fluid/tests/unittests/test_base_layer.py | 6 +- .../unittests/test_beam_search_decode_op.py | 5 +- .../tests/unittests/test_bernoulli_op.py | 4 +- .../test_buffer_shared_memory_reuse_pass.py | 23 +- .../tests/unittests/test_calc_gradient.py | 2 +- .../fluid/tests/unittests/test_cast_op.py | 7 +- .../unittests/test_class_center_sample_op.py | 20 +- .../fluid/tests/unittests/test_compare_op.py | 100 +- .../tests/unittests/test_compiled_program.py | 4 +- .../fluid/tests/unittests/test_concat_op.py | 6 +- .../fluid/tests/unittests/test_conj_op.py | 8 +- .../fluid/tests/unittests/test_cuda_graph.py | 8 +- .../test_cuda_graph_partial_graph.py | 6 +- .../tests/unittests/test_cuda_stream_event.py | 2 +- .../fluid/tests/unittests/test_cumsum_op.py | 8 +- .../test_decoupled_py_reader_data_check.py | 4 +- .../tests/unittests/test_dist_fleet_ctr.py | 7 + .../fluid/tests/unittests/test_dot_op.py | 5 +- .../fluid/tests/unittests/test_dropout_op.py | 55 +- .../test_dynamic_rnn_stop_gradient.py | 2 +- .../tests/unittests/test_eager_run_program.py | 10 +- .../unittests/test_egr_code_generate_api.py | 4 +- .../tests/unittests/test_egr_python_api.py | 125 +- .../unittests/test_egr_string_tensor_api.py | 22 +- 
.../unittests/test_elementwise_min_op.py | 14 +- .../unittests/test_elementwise_pow_op.py | 6 +- .../test_embedding_id_stop_gradient.py | 2 +- .../test_executor_check_fetch_list.py | 2 +- ..._executor_return_tensor_not_overwriting.py | 4 +- .../fluid/tests/unittests/test_expand_op.py | 6 +- .../tests/unittests/test_expand_v2_op.py | 10 +- .../tests/unittests/test_exponential_op.py | 2 +- .../fluid/tests/unittests/test_fc_op.py | 2 +- .../fluid/tests/unittests/test_fetch_var.py | 12 +- .../fluid/tests/unittests/test_fill_any_op.py | 38 + .../tests/unittests/test_fill_constant_op.py | 2 +- .../unittests/test_fill_diagonal_tensor_op.py | 14 +- .../fluid/tests/unittests/test_fill_op.py | 2 +- .../fluid/tests/unittests/test_fold_op.py | 6 +- .../tests/unittests/test_fused_matmul_bias.py | 10 +- .../test_fused_multi_transformer_op.py | 55 + .../fluid/tests/unittests/test_gather_op.py | 2 +- .../test_generate_proposals_v2_op.py | 292 +++-- .../unittests/test_graph_send_recv_op.py | 292 ++++- .../test_imperative_auto_mixed_precision.py | 6 +- ...perative_auto_mixed_precision_for_eager.py | 6 +- .../unittests/test_imperative_auto_prune.py | 11 +- .../tests/unittests/test_imperative_basic.py | 86 +- .../test_imperative_data_parallel.py | 6 +- .../unittests/test_imperative_double_grad.py | 43 +- .../test_imperative_hook_for_layer.py | 18 +- .../test_imperative_load_static_param.py | 2 +- ..._imperative_lod_tensor_to_selected_rows.py | 5 +- .../unittests/test_imperative_numpy_bridge.py | 11 +- .../test_imperative_ocr_attention_model.py | 4 +- .../unittests/test_imperative_ptb_rnn.py | 14 +- ...test_imperative_ptb_rnn_sorted_gradient.py | 14 +- .../test_imperative_recurrent_usage.py | 8 +- .../unittests/test_imperative_save_load.py | 66 +- .../unittests/test_imperative_save_load_v2.py | 67 +- ..._imperative_selected_rows_to_lod_tensor.py | 8 +- .../test_imperative_static_runner_mnist.py | 10 +- .../test_imperative_static_runner_while.py | 2 +- ...imperative_trace_non_persistable_inputs.py | 2 +- ..._imperative_transformer_sorted_gradient.py | 22 +- .../unittests/test_imperative_triple_grad.py | 25 +- .../test_imperative_using_non_zero_gpu.py | 2 +- .../fluid/tests/unittests/test_initializer.py | 42 +- .../fluid/tests/unittests/test_inplace.py | 6 +- .../unittests/test_inplace_addto_strategy.py | 2 +- .../test_inplace_auto_generated_apis.py | 2 +- .../tests/unittests/test_jit_save_load.py | 60 +- .../fluid/tests/unittests/test_lambv2_op.py | 9 +- .../tests/unittests/test_layer_norm_op.py | 2 +- .../fluid/tests/unittests/test_layers.py | 375 +++--- .../fluid/tests/unittests/test_load_op.py | 2 +- .../fluid/tests/unittests/test_load_op_xpu.py | 2 +- .../test_load_state_dict_from_old_format.py | 2 +- .../tests/unittests/test_lod_tensor_array.py | 16 +- .../unittests/test_lookup_table_bf16_op.py | 4 +- .../unittests/test_lookup_table_v2_bf16_op.py | 4 +- .../unittests/test_margin_cross_entropy_op.py | 53 +- .../tests/unittests/test_math_op_patch.py | 104 +- .../unittests/test_math_op_patch_var_base.py | 421 +++---- .../tests/unittests/test_matmul_v2_op.py | 2 +- .../unittests/test_max_min_amax_amin_op.py | 4 + .../fluid/tests/unittests/test_mean_op.py | 4 +- .../fluid/tests/unittests/test_memcpy_op.py | 4 +- .../test_memory_reuse_exclude_feed_var.py | 2 +- .../tests/unittests/test_merged_adam_op.py | 2 +- .../unittests/test_merged_momentum_op.py | 6 +- .../tests/unittests/test_mixed_precision.py | 6 +- .../fluid/tests/unittests/test_mse_loss.py | 1 + .../tests/unittests/test_multinomial_op.py | 10 +- 
.../unittests/test_op_function_generator.py | 13 +- .../fluid/tests/unittests/test_ops_nms.py | 28 +- .../fluid/tests/unittests/test_optimizer.py | 3 +- .../fluid/tests/unittests/test_pad_op.py | 1 + .../test_paddle_imperative_double_grad.py | 10 +- .../tests/unittests/test_paddle_save_load.py | 256 ++--- .../unittests/test_paddle_save_load_binary.py | 15 +- ...st_parallel_executor_fetch_isolated_var.py | 2 +- ...el_executor_inference_feed_partial_data.py | 6 +- .../fluid/tests/unittests/test_parameter.py | 16 +- .../fluid/tests/unittests/test_poisson_op.py | 22 +- .../fluid/tests/unittests/test_program.py | 39 + .../fluid/tests/unittests/test_prune.py | 26 +- .../unittests/test_py_reader_combination.py | 4 +- .../fluid/tests/unittests/test_randint_op.py | 12 +- .../fluid/tests/unittests/test_randperm_op.py | 24 +- .../tests/unittests/test_real_imag_op.py | 6 +- .../fluid/tests/unittests/test_reverse_op.py | 4 +- .../paddle/fluid/tests/unittests/test_run.py | 33 +- .../fluid/tests/unittests/test_scale_op.py | 4 +- .../tests/unittests/test_scatter_nd_op.py | 5 +- .../fluid/tests/unittests/test_scatter_op.py | 2 +- .../tests/unittests/test_set_value_op.py | 76 +- .../tests/unittests/test_shuffle_batch_op.py | 2 +- .../fluid/tests/unittests/test_slice_op.py | 44 +- .../tests/unittests/test_sparse_unary_op.py | 2 +- .../fluid/tests/unittests/test_split_op.py | 16 + .../tests/unittests/test_split_program.py | 6 +- .../unittests/test_squared_l2_norm_op.py | 2 +- .../fluid/tests/unittests/test_stack_op.py | 10 +- .../tests/unittests/test_static_save_load.py | 59 +- .../unittests/test_static_save_load_bf16.py | 2 +- .../unittests/test_static_save_load_large.py | 4 +- .../tests/unittests/test_strided_slice_op.py | 9 +- .../fluid/tests/unittests/test_sum_op.py | 19 +- .../fluid/tests/unittests/test_tensor.py | 104 +- .../unittests/test_tensor_array_to_tensor.py | 68 +- .../tests/unittests/test_tensor_copy_from.py | 6 +- .../test_tensor_fill_diagonal_tensor.py | 5 + .../unittests/test_tensor_register_hook.py | 97 +- ...st_tensor_scalar_type_promotion_dynamic.py | 2 +- ...est_tensor_scalar_type_promotion_static.py | 2 +- .../fluid/tests/unittests/test_tensor_uva.py | 5 + .../tests/unittests/test_translated_layer.py | 10 +- .../tests/unittests/test_transpose_op.py | 8 +- .../fluid/tests/unittests/test_unbind_op.py | 460 ++++---- .../fluid/tests/unittests/test_unpool3d_op.py | 24 +- .../fluid/tests/unittests/test_unpool_op.py | 29 +- .../tests/unittests/test_unsqueeze_op.py | 10 +- .../fluid/tests/unittests/test_var_base.py | 313 +++-- .../fluid/tests/unittests/test_variable.py | 146 +-- .../test_view_op_reuse_allocation.py | 2 +- .../fluid/tests/unittests/test_where_op.py | 2 +- .../tests/unittests/test_while_loop_op.py | 2 +- .../tests/unittests/xpu/test_assign_op_xpu.py | 4 +- .../unittests/xpu/test_assign_value_op_xpu.py | 24 +- .../xpu/test_bilinear_interp_op_xpu.py | 2 +- .../tests/unittests/xpu/test_clip_op_xpu.py | 22 +- .../unittests/xpu/test_dropout_op_xpu.py | 23 +- .../tests/unittests/xpu/test_empty_op_xpu.py | 134 +++ .../xpu/test_fleet_exe_dist_model_run_xpu.py | 93 ++ .../xpu/test_gaussian_random_op_xpu.py | 8 +- .../tests/unittests/xpu/test_matmul_op_xpu.py | 17 +- .../tests/unittests/xpu/test_pool2d_op_xpu.py | 18 + .../tests/unittests/xpu/test_scale_op_xpu.py | 4 +- .../unittests/xpu/test_squeeze2_op_xpu.py | 13 +- .../unittests/xpu/test_squeeze_op_xpu.py | 120 +- .../unittests/xpu/test_unsqueeze2_op_xpu.py | 13 +- .../unittests/xpu/test_unsqueeze_op_xpu.py | 113 +- 
python/paddle/geometric/__init__.py | 19 + .../geometric/message_passing/__init__.py | 15 + .../geometric/message_passing/send_recv.py | 162 +++ .../paddle/geometric/message_passing/utils.py | 52 + python/paddle/incubate/autograd/primops.py | 5 + python/paddle/incubate/autograd/primrules.py | 20 +- python/paddle/incubate/autotune.py | 1 - .../incubate/operators/graph_send_recv.py | 120 +- python/paddle/jit/layer.py | 11 +- python/paddle/nn/functional/activation.py | 1 - python/paddle/nn/functional/common.py | 75 +- python/paddle/nn/functional/distance.py | 4 +- python/paddle/nn/functional/extension.py | 5 +- python/paddle/nn/functional/loss.py | 63 +- python/paddle/nn/functional/pooling.py | 53 +- python/paddle/nn/functional/vision.py | 2 - python/paddle/nn/initializer/constant.py | 2 +- python/paddle/nn/initializer/normal.py | 1 - python/paddle/nn/initializer/uniform.py | 1 - python/paddle/nn/initializer/xavier.py | 2 - python/paddle/nn/layer/activation.py | 1 - python/paddle/nn/layer/loss.py | 32 +- python/paddle/nn/layer/pooling.py | 2 +- python/paddle/nn/layer/vision.py | 2 - python/paddle/optimizer/lamb.py | 10 +- python/paddle/profiler/profiler.py | 2 - python/paddle/profiler/utils.py | 1 - python/paddle/static/io.py | 62 +- python/paddle/tensor/creation.py | 264 ++--- python/paddle/tensor/linalg.py | 3 +- python/paddle/tensor/manipulation.py | 49 +- python/paddle/tensor/math.py | 61 +- python/paddle/tensor/random.py | 1 - python/paddle/tensor/search.py | 5 +- python/paddle/tensor/stat.py | 1 - python/paddle/tests/test_dlpack.py | 17 +- python/paddle/tests/test_hapi_amp.py | 9 +- .../utils/cpp_extension/cpp_extension.py | 3 +- python/paddle/vision/ops.py | 19 +- python/setup.py.in | 12 +- tools/check_api_approvals.sh | 4 +- tools/coverage/gcda_clean.py | 15 +- tools/dockerfile/Dockerfile.ipu | 14 +- 822 files changed, 25011 insertions(+), 13613 deletions(-) create mode 100644 paddle/fluid/distributed/auto_parallel/CMakeLists.txt create mode 100644 paddle/fluid/distributed/auto_parallel/auto_parallel.proto create mode 100644 paddle/fluid/distributed/auto_parallel/device_mesh.cc create mode 100644 paddle/fluid/distributed/auto_parallel/device_mesh.h create mode 100644 paddle/fluid/distributed/auto_parallel/device_mesh_test.cc create mode 100644 paddle/fluid/distributed/auto_parallel/dist_attr.cc create mode 100644 paddle/fluid/distributed/auto_parallel/dist_attr.h create mode 100644 paddle/fluid/distributed/auto_parallel/dist_attr_test.cc create mode 100644 paddle/fluid/distributed/auto_parallel/dist_mapper.cc create mode 100644 paddle/fluid/distributed/auto_parallel/dist_mapper.h create mode 100644 paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc create mode 100644 paddle/fluid/distributed/auto_parallel/process_mesh.cc create mode 100644 paddle/fluid/distributed/auto_parallel/process_mesh.h create mode 100644 paddle/fluid/distributed/auto_parallel/process_mesh_test.cc create mode 100644 paddle/fluid/distributed/auto_parallel/utils.h rename paddle/fluid/{operators/fill_diagonal_tensor_op.h => framework/inference_cached_ops.h} (58%) mode change 100644 => 100755 paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc rename paddle/fluid/framework/ir/mkldnn/{reshape_transpose_matmul_v2_mkldnn_fuse_pass.h => matmul_elementwise_add_mkldnn_fuse_pass.h} (57%) rename paddle/fluid/framework/ir/mkldnn/{matmul_transpose_reshape_fuse_pass.cc => matmul_transpose_reshape_mkldnn_fuse_pass.cc} 
(70%) rename paddle/fluid/framework/ir/mkldnn/{matmul_transpose_reshape_fuse_pass.h => matmul_transpose_reshape_mkldnn_fuse_pass.h} (80%) rename paddle/fluid/framework/ir/mkldnn/{matmul_transpose_reshape_fuse_pass_tester.cc => matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc} (92%) create mode 100644 paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc create mode 100644 paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h create mode 100644 paddle/fluid/inference/tensorrt/convert/fill_constant_batch_size_like_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/rnn_op.cc delete mode 100644 paddle/fluid/jit/ast.h create mode 100644 paddle/fluid/jit/engine/CMakeLists.txt rename paddle/fluid/jit/{base_function.h => engine/base_engine.h} (95%) create mode 100644 paddle/fluid/jit/engine/executor_engine.cc create mode 100644 paddle/fluid/jit/engine/executor_engine.h create mode 100644 paddle/fluid/jit/engine/pe_engine.cc create mode 100644 paddle/fluid/jit/engine/pe_engine.h delete mode 100644 paddle/fluid/jit/executor_function.h create mode 100644 paddle/fluid/jit/function.cc rename paddle/fluid/{framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h => jit/function.h} (54%) delete mode 100644 paddle/fluid/jit/object.h delete mode 100644 paddle/fluid/jit/pe_function.h delete mode 100644 paddle/fluid/operators/class_center_sample_op.cu delete mode 100644 paddle/fluid/operators/class_center_sample_op.h delete mode 100644 paddle/fluid/operators/detection/generate_proposals_v2_op.cu delete mode 100644 paddle/fluid/operators/detection/prior_box_op.cu delete mode 100644 paddle/fluid/operators/erfinv_op.cc delete mode 100644 paddle/fluid/operators/fill_any_op.cu.cc delete mode 100644 paddle/fluid/operators/fill_any_op.h delete mode 100644 paddle/fluid/operators/fill_diagonal_tensor_op.cu delete mode 100644 paddle/fluid/operators/fold_op.cu delete mode 100644 paddle/fluid/operators/fold_op.h delete mode 100644 paddle/fluid/operators/margin_cross_entropy_op.cu delete mode 100644 paddle/fluid/operators/margin_cross_entropy_op.h delete mode 100644 paddle/fluid/operators/optimizers/lamb_op.h create mode 100644 paddle/fluid/operators/optimizers/merged_momentum_op_xpu.cc create mode 100644 paddle/fluid/operators/prim_ops/log_p_op.cc delete mode 100644 paddle/fluid/operators/spectral_helper.h delete mode 100644 paddle/fluid/operators/spectral_op.cc delete mode 100644 paddle/fluid/operators/spectral_op.cu delete mode 100644 paddle/fluid/operators/spectral_op.cu.h delete mode 100644 paddle/fluid/operators/spectral_op.h delete mode 100644 paddle/fluid/operators/unpool_op.cu.cc delete mode 100644 paddle/fluid/operators/unpool_op.h delete mode 100644 paddle/fluid/platform/device_event_xpu.cc mode change 100755 => 100644 paddle/phi/infermeta/backward.h create mode 100644 paddle/phi/kernels/class_center_sample_kernel.h create mode 100644 paddle/phi/kernels/cpu/class_center_sample_kernel.cc create mode 100644 paddle/phi/kernels/cpu/fft_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/fft_kernel.cc create mode 100644 paddle/phi/kernels/cpu/fill_diagonal_tensor_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc create mode 100644 paddle/phi/kernels/cpu/fill_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/fill_kernel.cc create mode 100644 paddle/phi/kernels/cpu/fold_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/fold_kernel.cc create mode 100644 
paddle/phi/kernels/cpu/generate_proposals_v2_kernel.cc create mode 100644 paddle/phi/kernels/cpu/lamb_kernel.cc create mode 100644 paddle/phi/kernels/cpu/margin_cross_entropy_kernel.cc create mode 100644 paddle/phi/kernels/cpu/unpool_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/unpool_kernel.cc create mode 100644 paddle/phi/kernels/fft_grad_kernel.h create mode 100644 paddle/phi/kernels/fft_kernel.h create mode 100644 paddle/phi/kernels/fill_diagonal_tensor_grad_kernel.h create mode 100644 paddle/phi/kernels/fill_diagonal_tensor_kernel.h rename paddle/phi/{ops/compat/erfinv_sig.cc => kernels/fill_grad_kernel.h} (68%) rename paddle/phi/kernels/{impl/fill_diagonal_kernel_impl.h => fill_kernel.h} (75%) create mode 100644 paddle/phi/kernels/fold_grad_kernel.h create mode 100644 paddle/phi/kernels/fold_kernel.h rename paddle/phi/kernels/{affine_grid_impl.h => funcs/affine_grid_utils.h} (98%) create mode 100644 paddle/phi/kernels/funcs/cufft_util.h rename paddle/{fluid/operators => phi/kernels/funcs}/detection/nms_util.h (84%) create mode 100644 paddle/phi/kernels/funcs/fft.cc create mode 100644 paddle/phi/kernels/funcs/fft.cu create mode 100644 paddle/phi/kernels/funcs/fft.h create mode 100644 paddle/phi/kernels/funcs/fft_cache.h create mode 100644 paddle/phi/kernels/funcs/fft_fill_conj.h create mode 100644 paddle/phi/kernels/funcs/fft_key.h create mode 100644 paddle/phi/kernels/funcs/hipfft_util.h create mode 100644 paddle/phi/kernels/funcs/lamb_functors.h create mode 100644 paddle/phi/kernels/funcs/mkl_fft_utils.h create mode 100644 paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h rename paddle/{fluid/operators => phi/kernels/funcs}/tensor_to_string.h (66%) create mode 100644 paddle/phi/kernels/generate_proposals_v2_kernel.h create mode 100644 paddle/phi/kernels/gpu/class_center_sample_kernel.cu create mode 100644 paddle/phi/kernels/gpu/fft_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/fft_kernel.cu create mode 100644 paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu create mode 100644 paddle/phi/kernels/gpu/fill_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/fill_kernel.cu create mode 100644 paddle/phi/kernels/gpu/fold_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/fold_kernel.cu create mode 100644 paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu create mode 100644 paddle/phi/kernels/gpu/lamb_kernel.cu create mode 100644 paddle/phi/kernels/gpu/margin_cross_entropy_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/margin_cross_entropy_kernel.cu create mode 100644 paddle/phi/kernels/gpu/unpool_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/unpool_kernel.cu create mode 100644 paddle/phi/kernels/impl/fft_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/fft_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/fill_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/fill_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/fold_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/fold_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/lamb_kernel_impl.h create mode 100644 paddle/phi/kernels/lamb_kernel.h create mode 100644 paddle/phi/kernels/margin_cross_entropy_grad_kernel.h create mode 100644 paddle/phi/kernels/margin_cross_entropy_kernel.h create mode 100644 paddle/phi/kernels/onednn/activation_grad_kernel.cc create mode 100644 paddle/phi/kernels/onednn/activation_kernel.cc create mode 100644 
paddle/phi/kernels/selected_rows/cpu/lamb_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/gpu/lamb_kernel.cu create mode 100644 paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h create mode 100644 paddle/phi/kernels/selected_rows/lamb_kernel.h create mode 100644 paddle/phi/kernels/unpool_grad_kernel.h create mode 100644 paddle/phi/kernels/unpool_kernel.h create mode 100644 paddle/phi/ops/compat/class_center_sample_sig.cc create mode 100644 paddle/phi/ops/compat/fill_diagonal_tensor_sig.cc create mode 100644 paddle/phi/ops/compat/fill_sig.cc rename paddle/{fluid/operators/optimizers/lamb_op.cu => phi/ops/compat/fold_sig.cc} (53%) create mode 100644 paddle/phi/ops/compat/lamb_sig.cc create mode 100644 paddle/phi/ops/compat/margin_cross_entropy_sig.cc create mode 100644 paddle/phi/ops/compat/unpool3d_sig.cc create mode 100644 paddle/phi/ops/compat/unpool_sig.cc create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_gradient_aggregation.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_rnn.py create mode 100644 python/paddle/fluid/tests/unittests/test_attribute_var.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py create mode 100644 python/paddle/geometric/__init__.py create mode 100644 python/paddle/geometric/message_passing/__init__.py create mode 100644 python/paddle/geometric/message_passing/send_recv.py create mode 100644 python/paddle/geometric/message_passing/utils.py diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 5dd84657c8605..c09ee715c3043 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -16,6 +16,12 @@ if(NOT WITH_CINN) return() endif() +if(NOT CINN_GIT_TAG) + set(CINN_GIT_TAG release/v0.2) +endif() + +message(STATUS "CINN version: " ${CINN_GIT_TAG}) + # TODO(zhhsplendid): CINN has lots of warnings during early development. # They will be treated as errors under paddle. We set no-error now and we will # clean the code in the future. 
@@ -26,7 +32,6 @@ add_definitions(-w) ###################################### include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -set(CINN_GIT_TAG release/v0.2) set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 395efda6c6ab2..c47dbe88edcee 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220731") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220810") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -19,7 +19,7 @@ endif() if(NOT DEFINED XPU_XDNN_BASE_URL) set(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220731") + set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220810") else() set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index e8d7ba1401ebe..c560dddfef5e7 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -510,7 +510,7 @@ function(op_library TARGET) if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator if(${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(gelu, MKLDNN);\n") elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 24e0a8c7a5d9f..b18ed421fcd78 100755 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -47,3 +47,4 @@ add_subdirectory(ps) add_subdirectory(test) add_subdirectory(index_dataset) add_subdirectory(fleet_executor) +add_subdirectory(auto_parallel) diff --git a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt new file mode 100644 index 0000000000000..976e76f8931ba --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt @@ -0,0 +1,37 @@ +cc_library( + device_mesh + SRCS device_mesh.cc + DEPS auto_parallel_proto) +cc_test( + device_mesh_test + SRCS device_mesh_test.cc + DEPS device_mesh) + +cc_library( + process_mesh + SRCS process_mesh.cc + DEPS auto_parallel_proto) +cc_test( + process_mesh_test + SRCS process_mesh_test.cc + DEPS process_mesh) + +cc_library( + dist_attr + SRCS dist_attr.cc + DEPS process_mesh auto_parallel_proto proto_desc) +cc_test( + dist_attr_test + SRCS dist_attr_test.cc + DEPS dist_attr) + +cc_library( + dist_mapper + SRCS dist_mapper.cc + DEPS device_mesh auto_parallel_proto) +cc_test( + dist_mapper_test + SRCS dist_mapper_test.cc + DEPS dist_mapper) + +proto_library(auto_parallel_proto SRCS auto_parallel.proto) diff --git a/paddle/fluid/distributed/auto_parallel/auto_parallel.proto b/paddle/fluid/distributed/auto_parallel/auto_parallel.proto new file mode 100644 index 0000000000000..1413e80a8acb1 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/auto_parallel.proto @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+
+package paddle.distributed.auto_parallel;
+
+// ProcessMesh is used to organize processes, like an n-dimensional array.
+message ProcessMeshProto {
+  // The size of each dimension.
+  repeated int64 shape = 1;
+
+  // These process ids are stored in row-major order.
+  // There are no duplicate process ids within one process mesh.
+  repeated int64 process_ids = 2;
+
+  // The name of each dimension.
+  repeated string dim_names = 3;
+
+}
+
+// This proto describes the capability of one device, such as its computation and memory.
+message DeviceCapabilityProto {
+  optional double single_precision_flops = 1;
+
+  optional double double_precision_flops = 2;
+
+  optional double memory_size_in_bytes = 3;
+
+  optional double clock_rate_in_ghz = 4;
+}
+
+// This proto represents a device.
+message DeviceProto {
+  // The global id of this device within the cluster.
+  optional int64 global_id = 1;
+
+  // The local id of this device within the machine.
+  optional int64 local_id = 2;
+
+  // The id of the machine that owns this device.
+  optional int64 machine_id = 3;
+
+  // The type of this device.
+  optional string type = 4;
+
+  // The capability of this device.
+  optional DeviceCapabilityProto capability = 5;
+}
+
+// This proto describes the capability of the link between two devices.
+message LinkCapabilityProto {
+  optional int64 bandwidth = 1; // Bytes/s
+  optional int64 latency = 2;
+}
+
+message LinkProto {
+  // The global id of the source device.
+  optional int64 source_id = 1;
+
+  // The global id of the target device.
+  optional int64 target_id = 2;
+
+  // Represents the link type.
+  optional string type = 3;
+
+  // The capability of this link.
+  optional LinkCapabilityProto capability = 4;
+}
+
+// DeviceMesh is used to organize devices, like an n-dimensional array.
+message DeviceMeshProto {
+  // The name of this mesh.
+  optional string name = 1;
+
+  // The size of each dimension.
+  repeated int64 shape = 2;
+
+  // These device ids are stored in row-major order.
+  // There are no duplicate device ids within one device mesh.
+  repeated int64 device_ids = 3;
+
+  // The name of each dimension.
+  repeated string dim_names = 4;
+
+  // The devices of this mesh.
+  repeated DeviceProto devices = 5;
+
+  // The links between devices in this mesh.
+  repeated LinkProto links = 6;
+}
+
+// Records the mapping between the logical processes and the physical devices.
+message DistributedMapperProto {
+  // The device meshes used by this distributed computation,
+  // which may be shared by different distributed computations.
+  repeated DeviceMeshProto device_meshes = 1;
+
+  message MapperEntryProto {
+    optional int64 process_id = 1;
+    optional string device_mesh_name = 2;
+    repeated int64 device_ids = 3;
+  }
+
+  // The mapping from process ids to device ids.
+  // It is also possible for one process to use multiple devices.
+  // It is possible for one device to be shared by multiple processes.
+ repeated MapperEntryProto process_id_to_device_ids = 2; +} diff --git a/paddle/fluid/distributed/auto_parallel/device_mesh.cc b/paddle/fluid/distributed/auto_parallel/device_mesh.cc new file mode 100644 index 0000000000000..6bf26ad6f74e4 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/device_mesh.cc @@ -0,0 +1,398 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/distributed/auto_parallel/device_mesh.h" +#include "paddle/fluid/distributed/auto_parallel/utils.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +std::string DeviceCapability::to_string() const { + std::string str; + str += "{sflops: " + to_string_with_precision(single_precision_flops) + ", "; + str += "dflops: " + to_string_with_precision(double_precision_flops) + ", "; + str += "memory: " + to_string_with_precision(memory_size_in_bytes) + ", "; + str += "rate: " + to_string_with_precision(clock_rate_in_ghz) + "}"; + return str; +} + +DeviceCapability DeviceCapability::from_proto( + const DeviceCapabilityProto &proto) { + DeviceCapability capability; + capability.single_precision_flops = proto.single_precision_flops(); + capability.double_precision_flops = proto.double_precision_flops(); + capability.memory_size_in_bytes = proto.memory_size_in_bytes(); + capability.clock_rate_in_ghz = proto.clock_rate_in_ghz(); + return capability; +} + +DeviceCapabilityProto DeviceCapability::to_proto() const { + DeviceCapabilityProto proto; + proto.set_single_precision_flops(single_precision_flops); + proto.set_double_precision_flops(double_precision_flops); + proto.set_memory_size_in_bytes(memory_size_in_bytes); + proto.set_clock_rate_in_ghz(clock_rate_in_ghz); + return proto; +} + +std::string Device::to_string() const { + std::string str = "{global_id: " + std::to_string(global_id_) + ", "; + str += "local_id: " + std::to_string(local_id_) + ", "; + str += "machine_id: " + std::to_string(machine_id_) + ", "; + str += "type: " + type_ + ", "; + str += "capability: " + capability_.to_string() + "}"; + return str; +} + +Device Device::from_proto(const DeviceProto &proto) { + Device device; + device.global_id_ = proto.global_id(); + device.local_id_ = proto.local_id(); + device.machine_id_ = proto.machine_id(); + device.type_ = proto.type(); + device.capability_ = DeviceCapability::from_proto(proto.capability()); + return device; +} + +DeviceProto Device::to_proto() const { + DeviceProto proto; + proto.set_global_id(global_id_); + proto.set_local_id(local_id_); + proto.set_machine_id(machine_id_); + proto.set_type(type_); + proto.mutable_capability()->CopyFrom(capability_.to_proto()); + return proto; +} + +bool operator==(const Device &lhs, const Device &rhs) { + if (lhs.global_id() != rhs.global_id()) { + return false; + } + if (lhs.local_id() != rhs.local_id()) { + return false; + } + if (lhs.machine_id() != rhs.machine_id()) { + return false; + } + if (lhs.type() != rhs.type()) { + return false; + } + return 
true; +} + +std::string LinkCapability::to_string() const { + std::string str; + str += "{bandwidth: " + to_string_with_precision(bandwidth) + ","; + str += "latency: " + to_string_with_precision(latency) + "}"; + return str; +} + +LinkCapability LinkCapability::from_proto(const LinkCapabilityProto &proto) { + LinkCapability capability; + capability.bandwidth = proto.bandwidth(); + capability.latency = proto.latency(); + return capability; +} + +LinkCapabilityProto LinkCapability::to_proto() const { + LinkCapabilityProto proto; + proto.set_bandwidth(bandwidth); + proto.set_latency(latency); + return proto; +} + +std::string Link::to_string() const { + std::string str = "{source_id:" + std::to_string(source_id_) + ","; + str += "target_id:" + std::to_string(target_id_) + ","; + str += "type:" + type_ + ","; + str += "capability:" + capability_.to_string() + "}"; + return str; +} + +Link Link::from_proto(const LinkProto &proto) { + Link link; + link.source_id_ = proto.source_id(); + link.target_id_ = proto.target_id(); + link.type_ = proto.type(); + link.capability_ = LinkCapability::from_proto(proto.capability()); + return link; +} + +LinkProto Link::to_proto() const { + LinkProto proto; + proto.set_source_id(source_id_); + proto.set_target_id(target_id_); + proto.set_type(type_); + proto.mutable_capability()->CopyFrom(capability_.to_proto()); + return proto; +} + +bool operator==(const Link &lhs, const Link &rhs) { + if (lhs.source_id() != rhs.source_id()) { + return false; + } + if (lhs.target_id() != rhs.target_id()) { + return false; + } + if (lhs.type() != rhs.type()) { + return false; + } + return true; +} + +bool Machine::contains(int64_t device_id) const { + if (devices_.count(device_id) == 1) { + return true; + } else { + return false; + } +} + +void Machine::add_device(const Device &device) { + if (id() == -1) { + set_id(device.machine_id()); + } else { + PADDLE_ENFORCE_EQ(device.machine_id(), + id(), + platform::errors::InvalidArgument( + "The machine id [%d] of the device should be equal " + "to this machine id [%d].", + device.machine_id(), + id_)); + } + devices_[device.global_id()] = &device; +} + +void Machine::add_link(const Link &link) { + PADDLE_ENFORCE_EQ(contains(link.source_id()), + true, + platform::errors::InvalidArgument( + "The source device id of the added link [%s] " + "cannot be found in the device_ids. 
Please add the " + "source device before adding this link", + std::to_string(link.source_id()))); + links_[link.source_id()][link.target_id()] = &link; +} + +std::string Machine::to_string() const { + std::string str = "{devices: ["; + for (const auto &device : devices_) { + str += device.second->to_string() + ", "; + } + str.replace(str.size() - 2, 2, "], "); + + str += "links: ["; + for (const auto &item : links_) { + str += "{"; + str += "source_id: " + std::to_string(item.first) + ", neighbors: ["; + for (const auto &link : item.second) { + str += link.second->to_string() + ", "; + } + str.replace(str.size() - 2, 2, "]}, "); + } + str.replace(str.size() - 4, 4, "]}"); + return str; +} + +DeviceMesh::DeviceMesh(const std::string &name, + const std::vector &shape, + const std::vector &device_ids, + const std::vector &dim_names) { + name_ = name; + shape_ = shape; + int64_t size = this->size(); + + PADDLE_ENFORCE_EQ(size, + device_ids.size(), + platform::errors::InvalidArgument( + "The size %d of this device mesh must be " + "equal to the size %d of its device ids.", + size, + device_ids.size())); + PADDLE_ENFORCE_EQ( + has_duplicates(device_ids), + false, + platform::errors::InvalidArgument("The device ids [%s] must be unique.", + str_join(device_ids))); + device_ids_ = device_ids; + + PADDLE_ENFORCE_EQ( + shape_.size(), + dim_names.size(), + platform::errors::InvalidArgument( + "The size %d of mesh shape must be equal to the size %d " + "of the dimension names.", + shape_.size(), + dim_names.size())); + PADDLE_ENFORCE_EQ(has_duplicates(dim_names), + false, + platform::errors::InvalidArgument( + "The names [%s] of each dimension must be unique.", + str_join(dim_names))); + dim_names_ = dim_names; +} + +int64_t DeviceMesh::size() const { + if (shape_.empty()) return 0; + int64_t size = 1; + for (const int64_t dim_size : shape_) size *= dim_size; + return size; +} + +bool DeviceMesh::contains(int64_t device_id) const { + auto result = + std::find(std::begin(device_ids_), std::end(device_ids_), device_id); + if (result != std::end(device_ids_)) { + return true; + } else { + return false; + } +} + +void DeviceMesh::add_device(const Device &device) { + PADDLE_ENFORCE_EQ( + contains(device.global_id()), + true, + platform::errors::InvalidArgument( + "The added device id [%s] cannot be found in the device_ids.", + std::to_string(device.global_id()))); + // Operator [] will create a new object if it cannot find one. + // So we add the default constructor for Device and Machine + // to make sure the new object can be created. + devices_[device.global_id()] = device; + machines_[device.machine_id()].add_device(devices_[device.global_id()]); +} + +void DeviceMesh::add_link(const Link &link) { + PADDLE_ENFORCE_EQ( + contains(link.source_id()), + true, + platform::errors::InvalidArgument("The source id of the added link [%s] " + "cannot be found in the device_ids.", + std::to_string(link.source_id()))); + PADDLE_ENFORCE_EQ( + contains(link.target_id()), + true, + platform::errors::InvalidArgument("The source id of the added link [%s] " + "cannot be found in the device_ids.", + std::to_string(link.target_id()))); + // Operator [] will create a new object if it cannot find one. + // So we add the default constructor for Device and Machine + // to make sure the new object can be created. 
+ links_[link.source_id()][link.target_id()] = link; + const Device &source_device = devices_[link.source_id()]; + machines_[source_device.machine_id()].add_link( + links_[link.source_id()][link.target_id()]); +} + +std::string DeviceMesh::to_string() const { + std::string mesh_str = "{name: " + name_ + ", "; + mesh_str += "shape: [" + str_join(shape_) + "], "; + mesh_str += "device_ids: [" + str_join(device_ids_) + "], "; + mesh_str += "dim_names: [" + str_join(dim_names_) + "], "; + mesh_str += "\ndevices: [\n"; + for (const auto &device : devices_) { + mesh_str += " " + device.second.to_string() + ",\n"; + } + mesh_str.replace(mesh_str.size() - 2, 2, "],"); + + mesh_str += "\nlinks: [\n"; + for (const auto &item : links_) { + mesh_str += " {"; + mesh_str += "source_id: " + std::to_string(item.first) + ", neighbors: ["; + for (const auto &link : item.second) { + mesh_str += link.second.to_string() + ", "; + } + mesh_str.replace(mesh_str.size() - 2, 2, "]},\n"); + } + mesh_str.replace(mesh_str.size() - 4, 4, "]}"); + return mesh_str; +} + +DeviceMesh DeviceMesh::from_proto(const DeviceMeshProto &proto) { + DeviceMesh mesh; + + mesh.name_ = proto.name(); + + mesh.shape_.resize(proto.shape_size()); + for (int64_t i = 0; i < proto.shape_size(); ++i) { + mesh.shape_[i] = proto.shape(i); + } + + mesh.device_ids_.resize(proto.device_ids_size()); + for (int64_t i = 0; i < proto.device_ids_size(); ++i) { + mesh.device_ids_[i] = proto.device_ids(i); + } + + mesh.dim_names_.resize(proto.dim_names_size()); + for (int64_t i = 0; i < proto.dim_names_size(); ++i) { + mesh.dim_names_[i] = proto.dim_names(i); + } + + for (int64_t i = 0; i < proto.devices_size(); ++i) { + mesh.add_device(Device::from_proto(proto.devices(i))); + } + + for (int64_t i = 0; i < proto.links_size(); ++i) { + mesh.add_link(Link::from_proto(proto.links(i))); + } + + return mesh; +} + +DeviceMeshProto DeviceMesh::to_proto() const { + DeviceMeshProto proto; + + proto.set_name(name_); + + for (const auto &i : shape_) { + proto.add_shape(i); + } + + for (const auto &i : device_ids_) { + proto.add_device_ids(i); + } + + for (const auto &i : dim_names_) { + proto.add_dim_names(i); + } + + for (const auto &device : devices_) { + proto.mutable_devices()->Add()->CopyFrom(device.second.to_proto()); + } + + for (const auto &neighbors : links_) { + for (const auto &link : neighbors.second) { + proto.mutable_links()->Add()->CopyFrom(link.second.to_proto()); + } + } + + return proto; +} + +bool operator==(const DeviceMesh &lhs, const DeviceMesh &rhs) { + // Use the unique name to do the fast comparison + if (lhs.name() != rhs.name()) { + return false; + } + return true; +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/device_mesh.h b/paddle/fluid/distributed/auto_parallel/device_mesh.h new file mode 100644 index 0000000000000..15ec50f546d30 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/device_mesh.h @@ -0,0 +1,273 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/auto_parallel/auto_parallel.pb.h" +#include "paddle/fluid/distributed/auto_parallel/utils.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { +struct DeviceCapability { + double single_precision_flops = 0.0; + double double_precision_flops = 0.0; + double memory_size_in_bytes = 0.0; + double clock_rate_in_ghz = 0.0; + + // DeviceCapability from_string(const std::string& str); + std::string to_string() const; + + static DeviceCapability from_proto(const DeviceCapabilityProto& proto); + DeviceCapabilityProto to_proto() const; +}; + +inline std::ostream& operator<<(std::ostream& os, const DeviceCapability& obj) { + os << obj.to_string(); + return os; +} + +class Device { + public: + Device() = default; + Device(int64_t global_id, + int64_t local_id, + int64_t machine_id, + const std::string& type) + : global_id_(global_id), + local_id_(local_id), + machine_id_(machine_id), + type_(type) {} + + int64_t global_id() const { return global_id_; } + int64_t local_id() const { return local_id_; } + int64_t machine_id() const { return machine_id_; } + const std::string& type() const { return type_; } + + const DeviceCapability& capability() const { return capability_; } + void set_capability(const DeviceCapability& capability) { + capability_ = capability; + } + + // Device from_string(const std::string& mesh_str); + std::string to_string() const; + + static Device from_proto(const DeviceProto& proto); + DeviceProto to_proto() const; + + private: + int64_t global_id_; + int64_t local_id_; + int64_t machine_id_; + std::string type_; + DeviceCapability capability_; +}; + +inline std::ostream& operator<<(std::ostream& os, const Device& obj) { + os << obj.to_string(); + return os; +} + +bool operator==(const Device& lhs, const Device& rhs); + +inline bool operator!=(const Device& lhs, const Device& rhs) { + return !operator==(lhs, rhs); +} + +struct LinkCapability { + double bandwidth = 0.0; // Bytes/s + double latency = 0.0; + + // LinkCapability from_string(const std::string& str); + std::string to_string() const; + + static LinkCapability from_proto(const LinkCapabilityProto& proto); + LinkCapabilityProto to_proto() const; +}; + +inline std::ostream& operator<<(std::ostream& os, const LinkCapability& obj) { + os << obj.to_string(); + return os; +} + +class Link { + public: + Link() = default; + + Link(int64_t source_id, int64_t target_id, const std::string& type) + : source_id_(source_id), target_id_(target_id), type_(type) {} + + int64_t source_id() const { return source_id_; } + int64_t target_id() const { return target_id_; } + const std::string& type() const { return type_; } + + const LinkCapability& capability() const { return capability_; } + void set_capability(const LinkCapability& capability) { + capability_ = capability; + } + + // Link from_string(const std::string& str); + std::string to_string() const; + + static Link from_proto(const LinkProto& proto); + LinkProto to_proto() const; + + private: + int64_t source_id_; + int64_t target_id_; + std::string type_; + LinkCapability capability_; +}; + +inline std::ostream& operator<<(std::ostream& os, const Link& obj) { + os << obj.to_string(); + return os; +} + +bool operator==(const Link& lhs, const Link& rhs); + +inline bool 
operator!=(const Link& lhs, const Link& rhs) { + return !operator==(lhs, rhs); +} + +class Machine { + public: + Machine() = default; + + explicit Machine(int64_t id) : id_(id) {} + + int64_t id() const { return id_; } + + void set_id(int64_t id) { id_ = id; } + + bool contains(int64_t device_id) const; + + void add_device(const Device& device); + + void add_link(const Link& link); + + // Machine from_string(const std::string& str); + std::string to_string() const; + + private: + int64_t id_ = -1; + std::unordered_map devices_; + std::unordered_map> links_; +}; + +class DeviceMesh { + public: + DeviceMesh() = default; + + DeviceMesh(const std::string& name, + const std::vector& shape, + const std::vector& device_ids, + const std::vector& dim_names); + + const std::string& name() const { return name_; } + + void set_name(const std::string& name) { name_ = name; } + + const std::vector& shape() const { return shape_; } + + const std::vector& device_ids() const { return device_ids_; } + + const std::vector& dim_names() const { return dim_names_; } + + std::string device_type() const { + if (empty()) return std::string(); + return std::begin(devices_)->second.type(); + } + + const std::unordered_map& devices() const { + return devices_; + } + + const std::unordered_map>& links() + const { + return links_; + } + + const Device& device(int64_t global_id) const { + return devices_.at(global_id); + } + + const Link& link(int64_t source_id, int64_t target_id) const { + return links_.at(source_id).at(target_id); + } + + int64_t size() const; + int64_t ndim() const { return shape_.size(); } + + int64_t dim_size(int64_t dim) const { + int64_t cdim = canonical_dim(dim, shape_.size()); + return shape_[cdim]; + } + + int64_t dim_size(const std::string& dim_name) const { + for (std::size_t i = 0; i < dim_names_.size(); ++i) { + if (dim_names_[i] == dim_name) { + return shape_[i]; + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot find the dimension of %s in this device mesh.", dim_name)); + } + + bool empty() const { return (shape_.empty() || device_ids_.empty()); } + bool contains(int64_t device_id) const; + + void add_device(const Device& device); + void add_link(const Link& link); + + // DeviceMesh from_string(const std::string& mesh_str); + std::string to_string() const; + + static DeviceMesh from_proto(const DeviceMeshProto& proto); + DeviceMeshProto to_proto() const; + + private: + std::string name_; + std::vector shape_; + std::vector device_ids_; + std::vector dim_names_; + std::unordered_map devices_; + std::unordered_map> links_; + std::unordered_map machines_; +}; + +inline std::ostream& operator<<(std::ostream& os, const DeviceMesh& obj) { + os << obj.to_string(); + return os; +} + +bool operator==(const DeviceMesh& lhs, const DeviceMesh& rhs); + +inline bool operator!=(const DeviceMesh& lhs, const DeviceMesh& rhs) { + return !operator==(lhs, rhs); +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/device_mesh_test.cc b/paddle/fluid/distributed/auto_parallel/device_mesh_test.cc new file mode 100644 index 0000000000000..bdfc13baa424d --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/device_mesh_test.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/distributed/auto_parallel/device_mesh.h" +#include +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +TEST(DeviceMesh, Ctor) { + std::vector shape = {2, 3}; + std::vector device_ids = {0, 1, 2, 3, 4, 5}; + std::vector dim_names = {"x", "y"}; + std::string device_type = "GPU"; + int64_t size = shape[0] * shape[1]; + + DeviceMesh device_mesh("mesh", shape, device_ids, dim_names); + for (int64_t i = 0; i < shape[0]; ++i) { + for (int64_t j = 0; j < shape[1]; ++j) { + int64_t global_id = i * shape[1] + j; + int64_t local_id = j; + int64_t machine_id = i; + device_mesh.add_device( + Device(global_id, local_id, machine_id, device_type)); + } + } + for (int64_t i = 0; i < size; ++i) { + for (int64_t j = 0; j < size; ++j) { + device_mesh.add_link(Link(i, j, "NVL")); + } + } + + EXPECT_EQ(device_mesh.name(), "mesh"); + EXPECT_EQ(device_mesh.shape(), shape); + EXPECT_EQ(device_mesh.device_ids(), device_ids); + EXPECT_EQ(device_mesh.dim_names()[0], "x"); + EXPECT_EQ(device_mesh.dim_names()[1], "y"); + EXPECT_EQ(device_mesh.device_type(), device_type); + EXPECT_EQ(device_mesh.size(), size); + EXPECT_EQ(device_mesh.ndim(), static_cast(shape.size())); + EXPECT_EQ(device_mesh.dim_size(0), shape[0]); + EXPECT_EQ(device_mesh.dim_size(-1), shape[1]); + EXPECT_EQ(device_mesh.dim_size("x"), shape[0]); + EXPECT_EQ(device_mesh.dim_size("y"), shape[1]); + EXPECT_EQ(device_mesh.empty(), false); + EXPECT_EQ(device_mesh.contains(0), true); + EXPECT_EQ(device_mesh.contains(6), false); + EXPECT_EQ(device_mesh.device(3).global_id(), 3); + EXPECT_EQ(device_mesh.device(3).local_id(), 0); + EXPECT_EQ(device_mesh.device(3).machine_id(), 1); + EXPECT_EQ(device_mesh.device(3).type(), "GPU"); + EXPECT_EQ(device_mesh.link(3, 4).source_id(), 3); + EXPECT_EQ(device_mesh.link(3, 4).target_id(), 4); + EXPECT_EQ(device_mesh.link(3, 4).type(), "NVL"); + for (int64_t i = 0; i < shape[0]; ++i) { + for (int64_t j = 0; j < shape[1]; ++j) { + int64_t global_id = i * shape[1] + j; + int64_t local_id = j; + int64_t machine_id = i; + auto device = device_mesh.devices().at(global_id); + EXPECT_EQ(device, Device(global_id, local_id, machine_id, device_type)); + } + } + for (int64_t i = 0; i < size; ++i) { + for (int64_t j = 0; j < size; ++j) { + EXPECT_EQ(device_mesh.links().at(i).at(j), Link(i, j, "NVL")); + } + } + std::stringstream sstream; + sstream << device_mesh; + EXPECT_EQ(sstream.str(), device_mesh.to_string()); + auto proto = device_mesh.to_proto(); + DeviceMesh new_device_mesh = DeviceMesh::from_proto(proto); + EXPECT_EQ(device_mesh, new_device_mesh); +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/dist_attr.cc b/paddle/fluid/distributed/auto_parallel/dist_attr.cc new file mode 100644 index 0000000000000..9f9609962fc4d --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/dist_attr.cc @@ -0,0 +1,533 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/distributed/auto_parallel/dist_attr.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +std::vector TensorDistAttr::fields_{ + "process_mesh", "dims_mapping", "batch_dim", "dynamic_dims"}; + +TensorDistAttr::TensorDistAttr(const VarDesc& tensor) + : tensor_(&tensor), batch_dim_(0) { + set_default_dims_mapping(); + std::vector tensor_shape = tensor_->GetShape(); + for (std::size_t i = 0; i < tensor_shape.size(); ++i) { + dynamic_dims_.push_back(false); + } +} + +TensorDistAttr::TensorDistAttr(const TensorDistAttr& dist_attr) { + if (tensor_ == nullptr) { + tensor_ = dist_attr.tensor(); + } + set_process_mesh(dist_attr.process_mesh()); + set_dims_mapping(dist_attr.dims_mapping()); + set_batch_dim(dist_attr.batch_dim()); + set_dynamic_dims(dist_attr.dynamic_dims()); + set_annotated(dist_attr.annotated()); +} + +TensorDistAttr& TensorDistAttr::operator=(const TensorDistAttr& dist_attr) { + if (tensor_ == nullptr) { + tensor_ = dist_attr.tensor(); + } + set_process_mesh(dist_attr.process_mesh()); + set_dims_mapping(dist_attr.dims_mapping()); + set_batch_dim(dist_attr.batch_dim()); + set_dynamic_dims(dist_attr.dynamic_dims()); + set_annotated(dist_attr.annotated()); + return *this; +} + +void TensorDistAttr::set_process_mesh(const ProcessMesh& process_mesh) { + PADDLE_ENFORCE_EQ(verify_process_mesh(process_mesh), + true, + platform::errors::InvalidArgument( + "Wrong process mesh %s.", process_mesh.to_string())); + process_mesh_ = process_mesh; +} + +void TensorDistAttr::set_dims_mapping( + const std::vector& dims_mapping) { + PADDLE_ENFORCE_EQ(verify_dims_mapping(dims_mapping), + true, + platform::errors::InvalidArgument("Wrong dims_mapping %s.", + str_join(dims_mapping))); + dims_mapping_ = dims_mapping; +} + +void TensorDistAttr::set_batch_dim(int64_t batch_dim) { + PADDLE_ENFORCE_EQ( + verify_batch_dim(batch_dim), + true, + platform::errors::InvalidArgument( + "Wrong batch_dim %d in this distributed attribute.", batch_dim)); + if (tensor_ != nullptr) { + std::vector tensor_shape = tensor_->GetShape(); + int64_t canonical_batch_dim = canonical_dim(batch_dim, tensor_shape.size()); + batch_dim_ = canonical_batch_dim; + } else { + batch_dim_ = batch_dim; + } +} + +void TensorDistAttr::set_dynamic_dims(const std::vector& dynamic_dims) { + PADDLE_ENFORCE_EQ( + verify_dynamic_dims(dynamic_dims), + true, + platform::errors::InvalidArgument("The dynamic_dims [%s] is wrong.", + str_join(dynamic_dims))); + dynamic_dims_ = dynamic_dims; +} + +void TensorDistAttr::set_annotated( + const std::map& annotated) { + PADDLE_ENFORCE_EQ(verify_annotated(annotated), + true, + platform::errors::InvalidArgument( + "The annotated [%s] is wrong.", str_join(annotated))); + annotated_ = annotated; +} + +void 
TensorDistAttr::set_default_dims_mapping() { + if (tensor_ != nullptr) { + std::vector tensor_shape = tensor_->GetShape(); + dims_mapping_ = std::vector(tensor_shape.size(), -1); + } +} + +void TensorDistAttr::annotate(const std::string& name) { + auto result = std::find(std::begin(fields_), std::end(fields_), name); + if (result != std::end(fields_)) { + annotated_[name] = true; + } +} + +bool TensorDistAttr::verify_process_mesh( + const ProcessMesh& process_mesh) const { + if (!process_mesh_.empty()) { + for (int64_t dim_mapping : dims_mapping_) { + if (dim_mapping < -1 || dim_mapping >= process_mesh_.ndim()) { + return false; + } + } + } + return true; +} + +bool TensorDistAttr::verify_dims_mapping( + const std::vector& dims_mapping) const { + if (tensor_ != nullptr) { + std::vector tensor_shape = tensor_->GetShape(); + if (dims_mapping.size() != tensor_shape.size()) { + return false; + } + } + std::unordered_map map; + if (!process_mesh_.empty()) { + for (int64_t i : dims_mapping) { + if (i < -1 || i >= process_mesh_.ndim()) { + return false; + } + ++map[i]; + if (i != -1 && map[i] > 1) { + return false; + } + } + } else { + for (int64_t i : dims_mapping) { + ++map[i]; + if (i != -1 && map[i] > 1) { + return false; + } + } + } + return true; +} + +bool TensorDistAttr::verify_batch_dim(int64_t dim) const { + if (tensor_ != nullptr) { + std::vector tensor_shape = tensor_->GetShape(); + int64_t ndim = tensor_shape.size(); + if (dim < 0) { + dim = dim + ndim; + } + if (dim < 0 || dim >= ndim) { + return false; + } + } + return true; +} + +bool TensorDistAttr::verify_dynamic_dims( + const std::vector& dynamic_dims) const { + if (tensor_ != nullptr) { + std::vector tensor_shape = tensor_->GetShape(); + if (dynamic_dims.size() != tensor_shape.size()) { + return false; + } + } + return true; +} + +bool TensorDistAttr::verify_annotated( + const std::map& annotated) const { + for (const auto& item : annotated) { + auto result = std::find(std::begin(fields_), std::end(fields_), item.first); + if (result == std::end(fields_)) { + return false; + } + } + return true; +} + +bool TensorDistAttr::verify() const { + if (tensor_ == nullptr) { + return false; + } + if (!verify_process_mesh(process_mesh_)) { + return false; + } + if (!verify_dims_mapping(dims_mapping_)) { + return false; + } + if (!verify_batch_dim(batch_dim_)) { + return false; + } + if (!verify_dynamic_dims(dynamic_dims_)) { + return false; + } + if (!verify_annotated(annotated_)) { + return false; + } + return true; +} + +std::string TensorDistAttr::to_string() const { + std::string dist_str; + if (tensor_ != nullptr) { + dist_str = "{tensor_name: " + tensor_->Name() + ", "; + } else { + dist_str = "{tensor_name: None, "; + } + dist_str += "process_mesh: " + process_mesh_.to_string() + ", "; + dist_str += "dims_mappings: [" + str_join(dims_mapping_) + "], "; + dist_str += "batch_dim: " + std::to_string(batch_dim_) + ", "; + dist_str += "dynamic_dims: [" + str_join(dynamic_dims_) + "], "; + dist_str += "annotated: [" + str_join(annotated_) + "]}"; + return dist_str; +} + +bool operator==(const TensorDistAttr& lhs, const TensorDistAttr& rhs) { + if (lhs.process_mesh() != rhs.process_mesh()) { + return false; + } + if (lhs.dims_mapping() != rhs.dims_mapping()) { + return false; + } + if (lhs.batch_dim() != rhs.batch_dim()) { + return false; + } + if (lhs.dynamic_dims() != rhs.dynamic_dims()) { + return false; + } + return true; +} + +std::vector OperatorDistAttr::fields_{ + "process_mesh", "impl_type", "impl_idx"}; + 
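+// Illustrative usage sketch (a non-authoritative example, not part of the
+// public API contract): constructing an OperatorDistAttr from an OpDesc builds
+// a default TensorDistAttr for every input and output argument, which callers
+// may then override. Assuming an OpDesc `op` that has an input argument named
+// "X" registered in its block:
+//
+//   OperatorDistAttr dist_attr(op);
+//   TensorDistAttr x_attr = dist_attr.input_dist_attr("X");
+//   x_attr.set_dims_mapping({0, -1});
+//   dist_attr.set_input_dist_attr("X", x_attr);
+//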
+OperatorDistAttr::OperatorDistAttr(const OpDesc& op) : op_(&op) { + for (std::string name : op_->InputArgumentNames()) { + VarDesc* input = op_->Block()->FindVarRecursive(name); + inputs_[name] = input; + input_dist_attrs_[name] = TensorDistAttr(*input); + } + for (std::string name : op_->OutputArgumentNames()) { + VarDesc* output = op_->Block()->FindVarRecursive(name); + outputs_[name] = output; + output_dist_attrs_[name] = TensorDistAttr(*output); + } + impl_type_ = "default"; + impl_idx_ = 0; +} + +OperatorDistAttr::OperatorDistAttr(const OperatorDistAttr& dist_attr) { + if (op_ == nullptr) { + op_ = dist_attr.op(); + } + for (const auto& item : dist_attr.input_dist_attrs()) { + set_input_dist_attr(item.first, item.second); + } + for (const auto& item : dist_attr.output_dist_attrs()) { + set_output_dist_attr(item.first, item.second); + } + set_process_mesh(dist_attr.process_mesh()); + set_impl_type(dist_attr.impl_type()); + set_impl_idx(dist_attr.impl_idx()); + set_annotated(dist_attr.annotated()); +} + +OperatorDistAttr& OperatorDistAttr::operator=( + const OperatorDistAttr& dist_attr) { + if (op_ == nullptr) { + op_ = dist_attr.op(); + } + for (const auto& item : dist_attr.input_dist_attrs()) { + set_input_dist_attr(item.first, item.second); + } + for (const auto& item : dist_attr.output_dist_attrs()) { + set_output_dist_attr(item.first, item.second); + } + set_process_mesh(dist_attr.process_mesh()); + set_impl_type(dist_attr.impl_type()); + set_impl_idx(dist_attr.impl_idx()); + set_annotated(dist_attr.annotated()); + return *this; +} + +void OperatorDistAttr::set_input_dist_attr(const std::string& name, + const TensorDistAttr& dist_attr) { + PADDLE_ENFORCE_EQ( + verify_input_dist_attr(name, dist_attr), + true, + platform::errors::InvalidArgument( + "Wrong dist_attr %s for %s.", dist_attr.to_string(), name)); + input_dist_attrs_[name] = dist_attr; + // Make sure the process mesh of input be same as that of the op + input_dist_attrs_[name].set_process_mesh(process_mesh_); +} + +void OperatorDistAttr::set_output_dist_attr(const std::string& name, + const TensorDistAttr& dist_attr) { + PADDLE_ENFORCE_EQ( + verify_output_dist_attr(name, dist_attr), + true, + platform::errors::InvalidArgument( + "Wrong dist_attr %s for %s.", dist_attr.to_string(), name)); + output_dist_attrs_[name] = dist_attr; + // Make sure the process mesh of output be same as that of the op + output_dist_attrs_[name].set_process_mesh(process_mesh_); +} + +void OperatorDistAttr::set_process_mesh(const ProcessMesh& process_mesh) { + for (auto& item : input_dist_attrs_) { + item.second.set_process_mesh(process_mesh); + } + for (auto& item : output_dist_attrs_) { + item.second.set_process_mesh(process_mesh); + } + process_mesh_ = process_mesh; +} + +void OperatorDistAttr::annotate(const std::string& name) { + auto result = std::find(std::begin(fields_), std::end(fields_), name); + if (result != std::end(fields_)) { + annotated_[name] = true; + } + if (name == "process_mesh") { + for (auto& item : input_dist_attrs_) { + item.second.annotate(name); + } + for (auto& item : output_dist_attrs_) { + item.second.annotate(name); + } + } +} + +void OperatorDistAttr::set_annotated( + const std::map& annotated) { + PADDLE_ENFORCE_EQ(verify_annotated(annotated), + true, + platform::errors::InvalidArgument( + "The annotated [%s] is wrong.", str_join(annotated))); + annotated_ = annotated; +} + +bool OperatorDistAttr::verify_input_dist_attr( + const std::string& name, const TensorDistAttr& dist_attr) const { + if (!dist_attr.verify()) 
{ + return false; + } + if (op_ != nullptr) { + if (dist_attr.tensor() != nullptr) { + if (name != dist_attr.tensor()->Name()) { + return false; + } + } + if (input_dist_attrs_.count(name) == 0) { + return false; + } + } + return true; +} + +bool OperatorDistAttr::verify_output_dist_attr( + const std::string& name, const TensorDistAttr& dist_attr) const { + if (!dist_attr.verify()) { + return false; + } + if (op_ != nullptr) { + if (dist_attr.tensor() != nullptr) { + if (name != dist_attr.tensor()->Name()) { + return false; + } + } + if (output_dist_attrs_.count(name) == 0) { + return false; + } + } + return true; +} + +bool OperatorDistAttr::verify_process_mesh( + const ProcessMesh& process_mesh) const { + if (process_mesh != process_mesh_) { + return false; + } + for (auto& item : input_dist_attrs_) { + if (item.second.process_mesh() != process_mesh) { + return false; + } + } + for (auto& item : output_dist_attrs_) { + if (item.second.process_mesh() != process_mesh) { + return false; + } + } + return true; +} + +bool OperatorDistAttr::verify_annotated( + const std::map& annotated) const { + for (const auto& item : annotated) { + auto result = std::find(std::begin(fields_), std::end(fields_), item.first); + if (result == std::end(fields_)) { + return false; + } + } + for (auto& item : input_dist_attrs_) { + if (!item.second.verify_annotated(item.second.annotated())) { + return false; + } + } + for (auto& item : output_dist_attrs_) { + if (!item.second.verify_annotated(item.second.annotated())) { + return false; + } + } + return true; +} + +bool OperatorDistAttr::verify() const { + if (op_ == nullptr) { + return false; + } + if (!verify_process_mesh(process_mesh_)) { + return false; + } + for (auto const& item : input_dist_attrs_) { + auto input_names = op_->InputArgumentNames(); + auto found = + std::find(std::begin(input_names), std::end(input_names), item.first); + if (found == std::end(input_names)) { + return false; + } + if (!verify_input_dist_attr(item.first, item.second)) { + return false; + } + } + for (auto const& item : output_dist_attrs_) { + auto output_names = op_->OutputArgumentNames(); + auto found = + std::find(std::begin(output_names), std::end(output_names), item.first); + if (found == std::end(output_names)) { + return false; + } + if (!verify_output_dist_attr(item.first, item.second)) { + return false; + } + } + return true; +} + +std::string OperatorDistAttr::to_string() const { + std::string str; + if (op_ != nullptr) { + str += "{op_type: " + op_->Type() + ", "; + } else { + str += "{op_type: None, "; + } + str += "impl_type: " + impl_type_ + ", "; + str += "impl_idx: " + std::to_string(impl_idx_) + ", "; + str += "annotated: [" + str_join(annotated_) + "], "; + str += "\nprocess_mesh: " + process_mesh_.to_string() + ", "; + str += "\ninput_dist_attrs: [\n"; + for (auto const& item : input_dist_attrs_) { + str += " " + item.second.to_string() + ",\n"; + } + str.replace(str.size() - 2, 2, "]"); + str += "\noutput_dist_attrs: [\n"; + for (auto const& item : output_dist_attrs_) { + str += " " + item.second.to_string() + ",\n"; + } + str.replace(str.size() - 2, 2, "]}"); + return str; +} + +bool operator==(const OperatorDistAttr& lhs, const OperatorDistAttr& rhs) { + if (lhs.process_mesh() != rhs.process_mesh()) { + return false; + } + if (lhs.impl_type() != rhs.impl_type()) { + return false; + } + if (lhs.impl_idx() != rhs.impl_idx()) { + return false; + } + for (auto const& item : lhs.input_dist_attrs()) { + if (rhs.input_dist_attrs().count(item.first) != 1) { + 
return false; + } + if (rhs.input_dist_attrs().at(item.first) != + lhs.input_dist_attrs().at(item.first)) { + return false; + } + } + for (auto const& item : lhs.output_dist_attrs()) { + if (rhs.output_dist_attrs().count(item.first) != 1) { + return false; + } + if (rhs.output_dist_attrs().at(item.first) != + lhs.output_dist_attrs().at(item.first)) { + return false; + } + } + return true; +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/dist_attr.h b/paddle/fluid/distributed/auto_parallel/dist_attr.h new file mode 100644 index 0000000000000..ae089ef94b9d6 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/dist_attr.h @@ -0,0 +1,239 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/auto_parallel/auto_parallel.pb.h" +#include "paddle/fluid/distributed/auto_parallel/process_mesh.h" +#include "paddle/fluid/distributed/auto_parallel/utils.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { + +// Forward Declaration +namespace framework { + +class BlockDesc; +class OpDesc; +class ProgramDesc; +class VarDesc; + +} // namespace framework + +namespace distributed { +namespace auto_parallel { + +using framework::BlockDesc; +using framework::OpDesc; +using framework::ProgramDesc; +using framework::VarDesc; + +class TensorDistAttr { + public: + TensorDistAttr() = default; + + explicit TensorDistAttr(const VarDesc& tensor); + + TensorDistAttr(const TensorDistAttr& tensor); + + TensorDistAttr& operator=(const TensorDistAttr& dist_attr); + + const VarDesc* tensor() const { return tensor_; } + + const ProcessMesh& process_mesh() const { return process_mesh_; } + + void set_process_mesh(const ProcessMesh& process_mesh); + + const std::vector& dims_mapping() const { return dims_mapping_; } + + void set_dims_mapping(const std::vector& dims_mapping); + + int64_t batch_dim() const { return batch_dim_; } + + void set_batch_dim(int64_t batch_dim); + + const std::vector& dynamic_dims() const { return dynamic_dims_; } + + void set_dynamic_dims(const std::vector& dynamic_dims); + + const std::map& annotated() const { return annotated_; } + + void set_annotated(const std::map& annotated); + + void set_default_dims_mapping(); + + bool is_annotated(const std::string& name) const { + return annotated_.count(name) == 1; + } + + void annotate(const std::string& name); + + bool verify_process_mesh(const ProcessMesh& process_mesh) const; + + bool verify_dims_mapping(const std::vector& dims_mapping) const; + + bool verify_batch_dim(int64_t dim) const; + + bool verify_dynamic_dims(const std::vector& dynamic_dims) const; + + bool verify_annotated(const std::map& annotated) const; + + bool verify() const; + + // TensorDistAttr from_string(const std::string& dist_str); + std::string to_string() const; + + private: + static std::vector fields_; + const 
VarDesc* tensor_{nullptr}; + ProcessMesh process_mesh_; + std::vector dims_mapping_; + int64_t batch_dim_; + std::vector dynamic_dims_; + std::map annotated_; +}; + +inline std::ostream& operator<<(std::ostream& os, const TensorDistAttr& obj) { + os << obj.to_string(); + return os; +} + +bool operator==(const TensorDistAttr& lhs, const TensorDistAttr& rhs); + +inline bool operator!=(const TensorDistAttr& lhs, const TensorDistAttr& rhs) { + return !operator==(lhs, rhs); +} + +class OperatorDistAttr { + public: + OperatorDistAttr() = default; + + explicit OperatorDistAttr(const OpDesc& op); + + OperatorDistAttr(const OperatorDistAttr& dist_attr); + + OperatorDistAttr& operator=(const OperatorDistAttr& dist_attr); + + const OpDesc* op() const { return op_; } + + const VarDesc& input(const std::string& name) const { + return *inputs_.at(name); + } + + const VarDesc& output(const std::string& name) const { + return *outputs_.at(name); + } + + const std::map& input_dist_attrs() const { + return input_dist_attrs_; + } + + const std::map& output_dist_attrs() const { + return output_dist_attrs_; + } + + const TensorDistAttr& input_dist_attr(const std::string& name) const { + return input_dist_attrs_.at(name); + } + + TensorDistAttr& input_dist_attr(const std::string& name) { + return input_dist_attrs_.at(name); + } + + void set_input_dist_attr(const std::string& name, + const TensorDistAttr& dist_attr); + + const TensorDistAttr& output_dist_attr(const std::string& name) const { + return output_dist_attrs_.at(name); + } + + TensorDistAttr& output_dist_attr(const std::string& name) { + return output_dist_attrs_.at(name); + } + + void set_output_dist_attr(const std::string& name, + const TensorDistAttr& dist_attr); + + const ProcessMesh& process_mesh() const { return process_mesh_; } + + void set_process_mesh(const ProcessMesh& process_mesh); + + const std::string& impl_type() const { return impl_type_; } + + void set_impl_type(const std::string& impl_type) { impl_type_ = impl_type; } + + int64_t impl_idx() const { return impl_idx_; } + + void set_impl_idx(const int64_t& impl_idx) { impl_idx_ = impl_idx; } + + const std::map& annotated() const { return annotated_; } + + void set_annotated(const std::map& annotated); + + bool is_annotated(const std::string& name) const { + return annotated_.count(name) == 1; + } + + void annotate(const std::string& name); + + bool verify_input_dist_attr(const std::string& name, + const TensorDistAttr& dist_attr) const; + + bool verify_output_dist_attr(const std::string& name, + const TensorDistAttr& dist_attr) const; + + bool verify_process_mesh(const ProcessMesh& process_mesh) const; + + bool verify_annotated(const std::map& annotated) const; + + bool verify() const; + + // OperatorDistAttr from_string(const std::string& dist_str); + std::string to_string() const; + + private: + static std::vector fields_; + const OpDesc* op_{nullptr}; + std::map inputs_; + std::map outputs_; + std::map input_dist_attrs_; + std::map output_dist_attrs_; + ProcessMesh process_mesh_; + std::string impl_type_; + int64_t impl_idx_ = -1; + std::map annotated_; +}; + +inline std::ostream& operator<<(std::ostream& os, const OperatorDistAttr& obj) { + os << obj.to_string(); + return os; +} + +bool operator==(const OperatorDistAttr& lhs, const OperatorDistAttr& rhs); + +inline bool operator!=(const OperatorDistAttr& lhs, + const OperatorDistAttr& rhs) { + return !operator==(lhs, rhs); +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git 
a/paddle/fluid/distributed/auto_parallel/dist_attr_test.cc b/paddle/fluid/distributed/auto_parallel/dist_attr_test.cc new file mode 100644 index 0000000000000..1b9ac4271b4fb --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/dist_attr_test.cc @@ -0,0 +1,142 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/distributed/auto_parallel/dist_attr.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +TEST(DistAttr, ctor) { + ProgramDesc program; + auto* global_block = program.MutableBlock(0); + auto* x = global_block->Var("X"); + x->SetType(framework::proto::VarType::LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(framework::proto::VarType::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(framework::proto::VarType::LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(framework::proto::VarType::FP32); + y->SetShape({784, 100}); + + auto* op = global_block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {x->Name()}); + op->SetInput("Y", {y->Name()}); + + auto* out = global_block->Var("Out"); + out->SetType(framework::proto::VarType::LOD_TENSOR); + out->SetShape({1000, 100}); + op->SetOutput("Out", {out->Name()}); + + std::vector shape = {2, 4}; + std::vector process_ids = {0, 1, 2, 3, 4, 5, 6, 7}; + std::vector dim_names = {"x", "y"}; + ProcessMesh process_mesh(shape, process_ids, dim_names); + + std::vector shape2 = {2, 2}; + std::vector process_ids2 = {0, 1, 2, 3}; + std::vector dim_names2 = {"a", "b"}; + ProcessMesh process_mesh2(shape2, process_ids2, dim_names2); + + TensorDistAttr x_dist_attr(*x), y_dist_attr(*y), out_dist_attr(*out); + x_dist_attr.set_process_mesh(process_mesh); + x_dist_attr.set_dims_mapping(std::vector({0, -1})); + x_dist_attr.set_batch_dim(0); + x_dist_attr.set_dynamic_dims(std::vector({true, false})); + x_dist_attr.annotate("process_mesh"); + x_dist_attr.annotate("dims_mapping"); + EXPECT_EQ(x_dist_attr.process_mesh(), process_mesh); + EXPECT_EQ(x_dist_attr.dims_mapping(), std::vector({0, -1})); + EXPECT_EQ(x_dist_attr.batch_dim(), 0); + EXPECT_EQ(x_dist_attr.dynamic_dims(), std::vector({true, false})); + EXPECT_EQ(x_dist_attr.is_annotated("process_mesh"), true); + EXPECT_EQ(x_dist_attr.is_annotated("dims_mapping"), true); + EXPECT_EQ(x_dist_attr.verify(), true); + + std::stringstream x_sstream; + x_sstream << x_dist_attr; + EXPECT_EQ(x_sstream.str(), x_dist_attr.to_string()); + EXPECT_EQ(x_dist_attr, x_dist_attr); + + y_dist_attr.set_process_mesh(process_mesh); + y_dist_attr.set_dims_mapping(std::vector({-1, 0})); + y_dist_attr.set_batch_dim(-1); + y_dist_attr.set_dynamic_dims(std::vector({false, true})); + x_dist_attr.annotate("batch_dim"); + 
x_dist_attr.annotate("dynamic_dims"); + EXPECT_EQ(y_dist_attr.process_mesh(), process_mesh); + EXPECT_EQ(y_dist_attr.dims_mapping(), std::vector({-1, 0})); + EXPECT_EQ(y_dist_attr.batch_dim(), 1); + EXPECT_EQ(y_dist_attr.dynamic_dims(), std::vector({false, true})); + EXPECT_EQ(x_dist_attr.is_annotated("batch_dim"), true); + EXPECT_EQ(x_dist_attr.is_annotated("dynamic_dims"), true); + EXPECT_EQ(x_dist_attr.verify(), true); + + out_dist_attr.set_process_mesh(process_mesh); + out_dist_attr.set_dims_mapping(std::vector({0, 1})); + out_dist_attr.set_batch_dim(1); + out_dist_attr.set_dynamic_dims(std::vector({false, false})); + EXPECT_EQ(out_dist_attr.process_mesh(), process_mesh); + EXPECT_EQ(out_dist_attr.dims_mapping(), std::vector({0, 1})); + EXPECT_EQ(out_dist_attr.batch_dim(), 1); + EXPECT_EQ(out_dist_attr.dynamic_dims(), std::vector({false, false})); + EXPECT_EQ(out_dist_attr.verify(), true); + + OperatorDistAttr mul_dist_attr(*op); + mul_dist_attr.set_input_dist_attr(x->Name(), x_dist_attr); + mul_dist_attr.set_input_dist_attr(y->Name(), y_dist_attr); + mul_dist_attr.set_output_dist_attr(out->Name(), out_dist_attr); + mul_dist_attr.set_process_mesh(process_mesh2); + mul_dist_attr.set_impl_type("dist_mul"); + mul_dist_attr.set_impl_idx(0); + mul_dist_attr.annotate("process_mesh"); + mul_dist_attr.annotate("impl_type"); + mul_dist_attr.annotate("impl_idx"); + EXPECT_NE(mul_dist_attr.input_dist_attr(x->Name()), x_dist_attr); + EXPECT_NE(mul_dist_attr.input_dist_attr(y->Name()), y_dist_attr); + EXPECT_NE(mul_dist_attr.output_dist_attr(out->Name()), out_dist_attr); + EXPECT_EQ(mul_dist_attr.process_mesh(), process_mesh2); + EXPECT_EQ(mul_dist_attr.input_dist_attr(x->Name()).process_mesh(), + process_mesh2); + EXPECT_EQ(mul_dist_attr.input_dist_attr(y->Name()).process_mesh(), + process_mesh2); + EXPECT_EQ(mul_dist_attr.impl_type(), "dist_mul"); + EXPECT_EQ(mul_dist_attr.impl_idx(), 0); + EXPECT_EQ(mul_dist_attr.is_annotated("process_mesh"), true); + EXPECT_EQ(mul_dist_attr.is_annotated("impl_type"), true); + EXPECT_EQ(mul_dist_attr.is_annotated("impl_idx"), true); + EXPECT_EQ(mul_dist_attr.verify(), true); + + std::stringstream mul_sstream; + mul_sstream << mul_dist_attr; + EXPECT_EQ(mul_sstream.str(), mul_dist_attr.to_string()); + EXPECT_EQ(mul_dist_attr, mul_dist_attr); +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/dist_mapper.cc b/paddle/fluid/distributed/auto_parallel/dist_mapper.cc new file mode 100644 index 0000000000000..d0995604522e5 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/dist_mapper.cc @@ -0,0 +1,146 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/distributed/auto_parallel/dist_mapper.h" +#include "paddle/fluid/distributed/auto_parallel/utils.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +void DistributedMapper::set_process_id_to_device_ids( + const std::map>>& + process_id_to_device_ids) { + std::vector device_mesh_names; + for (const auto& item : device_meshes_) { + device_mesh_names.push_back(item.first); + } + for (const auto& item : process_id_to_device_ids) { + PADDLE_ENFORCE_GE( + item.first, + 0, + platform::errors::InvalidArgument( + "The process id %d must be greater than or equal to 0.", + item.first)); + std::string device_mesh_name = item.second.first; + const std::vector& device_ids = item.second.second; + PADDLE_ENFORCE_EQ( + device_meshes_.count(device_mesh_name), + 1, + platform::errors::InvalidArgument( + "Cannot find the device mesh %d in device_mesh ids [%s].", + device_mesh_name, + str_join(device_mesh_names))); + PADDLE_ENFORCE_EQ( + has_duplicates(device_ids), + false, + platform::errors::InvalidArgument( + "The mapped device ids [%s] of process_mesh %d must be unique.", + str_join(device_ids), + item.first)); + const DeviceMesh& device_mesh = device_meshes_[device_mesh_name]; + const std::vector cur_device_ids = device_mesh.device_ids(); + for (int64_t device_id : device_ids) { + bool found = + std::find(cur_device_ids.begin(), cur_device_ids.end(), device_id) != + cur_device_ids.end(); + PADDLE_ENFORCE_EQ( + found, + true, + platform::errors::InvalidArgument( + "The device id %d cannot be find in the device mesh [%s].", + device_id, + str_join(cur_device_ids))); + } + } + process_id_to_device_ids_ = process_id_to_device_ids; +} + +DistributedMapper DistributedMapper::from_proto( + const DistributedMapperProto& proto) { + DistributedMapper dist_mapper; + for (int64_t i = 0; i < proto.device_meshes_size(); ++i) { + dist_mapper.device_meshes_[proto.device_meshes(i).name()] = + DeviceMesh::from_proto(proto.device_meshes(i)); + } + for (int64_t i = 0; i < proto.process_id_to_device_ids_size(); ++i) { + int64_t process_id = proto.process_id_to_device_ids(i).process_id(); + std::string device_mesh_name = + proto.process_id_to_device_ids(i).device_mesh_name(); + std::vector device_ids; + int64_t num_devices = proto.process_id_to_device_ids(i).device_ids_size(); + for (int64_t j = 0; j < num_devices; ++j) { + device_ids.push_back(proto.process_id_to_device_ids(i).device_ids(j)); + } + dist_mapper.process_id_to_device_ids_[process_id].first = device_mesh_name; + dist_mapper.process_id_to_device_ids_[process_id].second = device_ids; + } + return dist_mapper; +} + +DistributedMapperProto DistributedMapper::to_proto() const { + DistributedMapperProto proto; + for (const auto& item : device_meshes_) { + proto.mutable_device_meshes()->Add()->CopyFrom(item.second.to_proto()); + } + for (const auto& outer : process_id_to_device_ids_) { + auto proto_item = proto.mutable_process_id_to_device_ids()->Add(); + proto_item->set_process_id(outer.first); + proto_item->set_device_mesh_name(outer.second.first); + for (const auto& inner : outer.second.second) { + proto_item->add_device_ids(inner); + } + } + return proto; +} + +std::string DistributedMapper::to_string() const { + std::string mapper_str = "{device_meshes: ["; + for (const auto& item : device_meshes_) { + mapper_str += item.second.to_string() + ", "; + } + mapper_str.replace(mapper_str.size() - 2, 2, "]"); + + mapper_str += "\nprocess_id_to_device_ids: ["; + for (const auto& item : 
process_id_to_device_ids_) { + mapper_str += "{"; + mapper_str += + "process_id: " + std::to_string(item.first) + ", device_ids: ["; + for (const auto& device_id : item.second.second) { + mapper_str += + "{" + item.second.first + ", " + std::to_string(device_id) + "}, "; + } + mapper_str.replace(mapper_str.size() - 2, 2, "]"); + mapper_str += "}, "; + } + mapper_str.replace(mapper_str.size() - 2, 2, "]"); + mapper_str += "}"; + return mapper_str; +} + +bool operator==(const DistributedMapper& lhs, const DistributedMapper& rhs) { + if (lhs.device_meshes() != rhs.device_meshes()) { + return false; + } + if (lhs.process_id_to_device_ids() != rhs.process_id_to_device_ids()) { + return false; + } + return true; +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/dist_mapper.h b/paddle/fluid/distributed/auto_parallel/dist_mapper.h new file mode 100644 index 0000000000000..bd7f9790ad69f --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/dist_mapper.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include "paddle/fluid/distributed/auto_parallel/auto_parallel.pb.h" +#include "paddle/fluid/distributed/auto_parallel/device_mesh.h" +#include "paddle/fluid/distributed/auto_parallel/process_mesh.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +class DistributedMapper { + public: + DistributedMapper() = default; + + const std::map& device_meshes() const { + return device_meshes_; + } + + const DeviceMesh& device_mesh(const std::string& name) const { + return device_meshes_.at(name); + } + + void add_device_mesh(const DeviceMesh& device_mesh) { + device_meshes_[device_mesh.name()] = device_mesh; + } + + const std::map>>& + process_id_to_device_ids() const { + return process_id_to_device_ids_; + } + + void set_process_id_to_device_ids( + const std::map>>& + process_id_to_device_ids); + + // DistributedMapper from_string(const std::string& mapper_str); + std::string to_string() const; + + static DistributedMapper from_proto(const DistributedMapperProto& proto); + DistributedMapperProto to_proto() const; + + private: + std::map device_meshes_; + std::map>> + process_id_to_device_ids_; +}; + +bool operator==(const DistributedMapper& lhs, const DistributedMapper& rhs); + +inline std::ostream& operator<<(std::ostream& os, + const DistributedMapper& obj) { + os << obj.to_string(); + return os; +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc b/paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc new file mode 100644 index 0000000000000..d427b9cbb09ed --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/dist_mapper_test.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/distributed/auto_parallel/dist_mapper.h" +#include +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +TEST(DistributedMapper, Ctor) { + std::vector shape = {2, 3}; + std::vector device_ids = {0, 1, 2, 3, 4, 5}; + std::vector dim_names = {"x", "y"}; + std::string device_type = "GPU"; + int64_t size = shape[0] * shape[1]; + + DeviceMesh device_mesh("device_mesh", shape, device_ids, dim_names); + for (int64_t i = 0; i < shape[0]; ++i) { + for (int64_t j = 0; j < shape[1]; ++j) { + int64_t global_id = i * shape[1] + j; + int64_t local_id = j; + int64_t machine_id = i; + device_mesh.add_device( + Device(global_id, local_id, machine_id, device_type)); + } + } + for (int64_t i = 0; i < size; ++i) { + for (int64_t j = 0; j < size; ++j) { + device_mesh.add_link(Link(i, j, "NVL")); + } + } + + DistributedMapper dist_mapper; + dist_mapper.add_device_mesh(device_mesh); + std::map>> + process_id_to_device_ids; + process_id_to_device_ids[0] = {"device_mesh", {5}}; + process_id_to_device_ids[1] = {"device_mesh", {4}}; + process_id_to_device_ids[2] = {"device_mesh", {3}}; + process_id_to_device_ids[3] = {"device_mesh", {2}}; + process_id_to_device_ids[4] = {"device_mesh", {1}}; + process_id_to_device_ids[5] = {"device_mesh", {0}}; + dist_mapper.set_process_id_to_device_ids(process_id_to_device_ids); + + EXPECT_EQ(dist_mapper.device_meshes().at("device_mesh"), device_mesh); + EXPECT_EQ(dist_mapper.device_mesh("device_mesh"), device_mesh); + EXPECT_EQ(dist_mapper.process_id_to_device_ids(), process_id_to_device_ids); + std::stringstream sstream; + sstream << dist_mapper; + EXPECT_EQ(sstream.str(), dist_mapper.to_string()); + auto proto = dist_mapper.to_proto(); + DistributedMapper new_dist_mapper = DistributedMapper::from_proto(proto); + EXPECT_EQ(dist_mapper, new_dist_mapper); +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/process_mesh.cc b/paddle/fluid/distributed/auto_parallel/process_mesh.cc new file mode 100644 index 0000000000000..dda2873768997 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/process_mesh.cc @@ -0,0 +1,134 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/distributed/auto_parallel/process_mesh.h" +#include "paddle/fluid/distributed/auto_parallel/utils.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +ProcessMesh::ProcessMesh(const std::vector &shape, + const std::vector &process_ids, + const std::vector &dim_names) { + shape_ = shape; + int64_t size = this->size(); + PADDLE_ENFORCE_EQ( + size, + process_ids.size(), + platform::errors::InvalidArgument("The size %d of this process mesh must " + "be equal to the size %d of its process ids.", + size, + process_ids.size())); + PADDLE_ENFORCE_EQ( + has_duplicates(process_ids), + false, + platform::errors::InvalidArgument("The process ids [%s] must be unique.", + str_join(process_ids))); + process_ids_ = process_ids; + + PADDLE_ENFORCE_EQ(shape_.size(), + dim_names.size(), + platform::errors::InvalidArgument( + "The size %d of mesh shape must be equal to the size " + "%d of the dimension names.", + shape_.size(), + dim_names.size())); + PADDLE_ENFORCE_EQ(has_duplicates(dim_names), + false, + platform::errors::InvalidArgument( + "The names [%s] of each dimension must be unique.", + str_join(dim_names))); + dim_names_ = dim_names; +} + +int64_t ProcessMesh::size() const { + if (shape_.empty()) return 0; + int64_t size = 1; + for (const int64_t dim_size : shape_) size *= dim_size; + return size; +} + +bool ProcessMesh::contains(int64_t process_id) const { + auto result = + std::find(std::begin(process_ids_), std::end(process_ids_), process_id); + if (result != std::end(process_ids_)) { + return true; + } else { + return false; + } +} + +std::string ProcessMesh::to_string() const { + std::string mesh_str = "{shape: [" + str_join(shape_) + "], "; + mesh_str += "process_ids: [" + str_join(process_ids_) + "], "; + mesh_str += "dim_names: [" + str_join(dim_names_) + "]}"; + return mesh_str; +} + +ProcessMesh ProcessMesh::from_proto(const ProcessMeshProto &proto) { + ProcessMesh mesh; + + mesh.shape_.resize(proto.shape_size()); + for (int64_t i = 0; i < proto.shape_size(); ++i) { + mesh.shape_[i] = proto.shape(i); + } + + mesh.process_ids_.resize(proto.process_ids_size()); + for (int64_t i = 0; i < proto.process_ids_size(); ++i) { + mesh.process_ids_[i] = proto.process_ids(i); + } + + mesh.dim_names_.resize(proto.dim_names_size()); + for (int64_t i = 0; i < proto.dim_names_size(); ++i) { + mesh.dim_names_[i] = proto.dim_names(i); + } + + return mesh; +} + +ProcessMeshProto ProcessMesh::to_proto() const { + ProcessMeshProto proto; + + for (const auto &i : shape_) { + proto.add_shape(i); + } + + for (const auto &i : process_ids_) { + proto.add_process_ids(i); + } + + for (const auto &i : dim_names_) { + proto.add_dim_names(i); + } + + return proto; +} + +bool operator==(const ProcessMesh &lhs, const ProcessMesh &rhs) { + if (lhs.shape() != rhs.shape()) { + return false; + } + if (lhs.process_ids() != rhs.process_ids()) { + return false; + } + return true; +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/process_mesh.h b/paddle/fluid/distributed/auto_parallel/process_mesh.h new file mode 100644 index 0000000000000..2652a8f606216 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/process_mesh.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/auto_parallel/auto_parallel.pb.h" +#include "paddle/fluid/distributed/auto_parallel/device_mesh.h" +#include "paddle/fluid/distributed/auto_parallel/utils.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +class ProcessMesh { + public: + ProcessMesh() = default; + + ProcessMesh(const std::vector& shape, + const std::vector& process_ids, + const std::vector& dim_names); + + const std::vector& shape() const { return shape_; } + + const std::vector& process_ids() const { return process_ids_; } + + const std::vector& dim_names() const { return dim_names_; } + + int64_t size() const; + + int64_t ndim() const { return shape_.size(); } + + int64_t dim_size(int64_t dim) const { + int64_t cdim = canonical_dim(dim, shape_.size()); + return shape_[cdim]; + } + + int64_t dim_size(const std::string& dim_name) const { + for (std::size_t i = 0; i < dim_names_.size(); ++i) { + if (dim_names_[i] == dim_name) { + return shape_[i]; + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot find the dimension of %s in this process mesh.", dim_name)); + } + + bool empty() const { return (shape_.empty() || process_ids_.empty()); } + bool contains(int64_t process_id) const; + + // ProcessMesh from_string(const std::string& mesh_str); + std::string to_string() const; + + static ProcessMesh from_proto(const ProcessMeshProto& proto); + ProcessMeshProto to_proto() const; + + private: + std::vector shape_; + std::vector process_ids_; + std::vector dim_names_; +}; + +inline std::ostream& operator<<(std::ostream& os, const ProcessMesh& obj) { + os << obj.to_string(); + return os; +} + +bool operator==(const ProcessMesh& lhs, const ProcessMesh& rhs); + +inline bool operator!=(const ProcessMesh& lhs, const ProcessMesh& rhs) { + return !operator==(lhs, rhs); +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/process_mesh_test.cc b/paddle/fluid/distributed/auto_parallel/process_mesh_test.cc new file mode 100644 index 0000000000000..9dbcc5ea2d31c --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/process_mesh_test.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/distributed/auto_parallel/process_mesh.h" +#include +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +TEST(ProcessMesh, Ctor) { + std::vector shape = {2, 3}; + std::vector process_ids = {0, 1, 2, 3, 4, 5}; + std::vector dim_names = {"x", "y"}; + int64_t size = shape[0] * shape[1]; + ProcessMesh process_mesh(shape, process_ids, dim_names); + EXPECT_EQ(process_mesh.shape(), shape); + EXPECT_EQ(process_mesh.process_ids(), process_ids); + EXPECT_EQ(process_mesh.dim_names()[0], "x"); + EXPECT_EQ(process_mesh.dim_names()[1], "y"); + EXPECT_EQ(process_mesh.size(), size); + EXPECT_EQ(process_mesh.ndim(), static_cast(shape.size())); + EXPECT_EQ(process_mesh.dim_size(0), shape[0]); + EXPECT_EQ(process_mesh.dim_size(-1), shape[1]); + EXPECT_EQ(process_mesh.dim_size("x"), shape[0]); + EXPECT_EQ(process_mesh.dim_size("y"), shape[1]); + EXPECT_EQ(process_mesh.empty(), false); + EXPECT_EQ(process_mesh.contains(0), true); + EXPECT_EQ(process_mesh.contains(6), false); + std::stringstream sstream; + sstream << process_mesh; + EXPECT_EQ(sstream.str(), process_mesh.to_string()); + auto proto = process_mesh.to_proto(); + ProcessMesh new_process_mesh = ProcessMesh::from_proto(proto); + EXPECT_EQ(process_mesh, new_process_mesh); +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/utils.h b/paddle/fluid/distributed/auto_parallel/utils.h new file mode 100644 index 0000000000000..de4162730b19c --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/utils.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +// struct Indent { +// Indent(int &level) : level(level) { ++level; } +// ~Indent() { --level; } +// int &level; +// }; + +// inline std::string str_indent(std::string& str, cur_indent) { +// string spaces(cur_indent, " "); +// return str + std::string(cur_indent, " "); +// } + +template +bool has_duplicates(const std::vector& vec) { + std::unordered_map map; + for (const auto& i : vec) { + ++map[i]; + if (map[i] > 1) return true; + } + return false; +} + +inline int64_t canonical_dim(int dim, int ndim) { + PADDLE_ENFORCE_EQ( + dim >= -ndim && dim < ndim, + true, + platform::errors::InvalidArgument( + "Dimension %d is outside of [-%d, %d).", dim, ndim, ndim)); + if (dim < 0) { + return dim + ndim; + } + return dim; +} + +// Refer to https://stackoverflow.com/a/5289170 +template +std::string str_join(Range const& elements, + const std::string& delimiter = ",") { + std::ostringstream os; + auto b = std::begin(elements), e = std::end(elements); + + if (b != e) { + std::copy(b, prev(e), std::ostream_iterator(os, delimiter.c_str())); + b = prev(e); + } + if (b != e) { + os << *b; + } + + return os.str(); +} + +inline std::string str_join(std::map const& elements, + const std::string& delimiter = ",") { + std::string str; + for (const auto& item : elements) { + str += item.first + ": " + std::to_string(item.second) + ","; + } + return str.substr(0, str.size() - 2); +} + +// Refer to https://stackoverflow.com/a/46931770 +inline std::vector str_split(std::string const& input, + const std::string& delimiter = ",") { + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector output; + while ((pos_end = input.find(delimiter, pos_start)) != std::string::npos) { + token = input.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + output.push_back(token); + } + output.push_back(input.substr(pos_start)); + return output; +} + +// Refer to https://stackoverflow.com/a/29200671/2358969 +template +std::string to_string_with_precision(const T a_value, const int n = 2) { + std::ostringstream out; + out.precision(n); + out << std::fixed << a_value; + return out.str(); +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 3b3b505ffb80c..718b33903af8b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -197,17 +197,6 @@ std::shared_ptr ProcessGroupHCCL::Collective( SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); auto task = CreateTask(places, rank_, op_type, inputs); - task->SetOutputs(outputs); - - // if (FLAGS_use_stream_safe_npu_allocator) { - // for (size_t i = 0; i < inputs.size(); ++i) { - // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); - // auto dense_tensor = - // std::dynamic_pointer_cast(inputs[i].impl()); - // memory::RecordStream(dense_tensor->Holder(), - // places_to_ctx_[key][i]->stream()); - // } - // } for (size_t i = 0; i < inputs.size(); ++i) { platform::NPUDeviceGuard guard(places[i].GetDeviceId()); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index d776f62373e43..168548cf9ba06 100644 
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -244,25 +244,24 @@ std::shared_ptr ProcessGroupNCCL::Collective( SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); auto task = CreateTask(places, rank_, op_type, inputs); - task->SetOutputs(outputs); // construct uninitialize guard for device platform::CUDADeviceGuard cuda_guard; - if (FLAGS_use_stream_safe_cuda_allocator) { + { + platform::NCCLGroupGuard nccl_guard; for (size_t i = 0; i < inputs.size(); ++i) { cuda_guard.SetDevice(places[i]); - memory::RecordStream(inputs[i].Holder(), - places_to_ctx_[key][i]->stream()); + const auto& nccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream); } } - { - platform::NCCLGroupGuard nccl_guard; + if (FLAGS_use_stream_safe_cuda_allocator) { for (size_t i = 0; i < inputs.size(); ++i) { cuda_guard.SetDevice(places[i]); - const auto& nccl_stream = places_to_ctx_[key][i]->stream(); - fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream); + memory::RecordStream(inputs[i].Holder(), + places_to_ctx_[key][i]->stream()); } } diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 0b46369b970ab..b14bc4f7ed4de 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -89,10 +89,23 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, #else PADDLE_THROW(paddle::platform::errors::Fatal( "Paddle wasn't compiled with CUDA, but place is GPU.")); +#endif + } else if (platform::is_xpu_place(place)) { + VLOG(3) << "Loading data for XPU."; +#if defined(PADDLE_WITH_XPU) + auto xpu_place = place; + memory::Copy(xpu_place, + static_cast(input_tensor_ptr), + platform::CPUPlace(), + input_data.data.data(), + input_data.data.length()); +#else + PADDLE_THROW(paddle::platform::errors::Fatal( + "Paddle wasn't compiled with XPU, but place is XPU.")); #endif } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "DistModel only supports CPU and GPU.")); + "DistModel only supports CPU, GPU, and XPU.")); } framework::LoD dst_lod; @@ -189,9 +202,12 @@ bool DistModel::PreparePlace() { place_ = paddle::platform::CUDAPlace(config_.device_id); } else if (config_.place == "CPU") { place_ = paddle::platform::CPUPlace(); + } else if (config_.place == "XPU") { + place_ = paddle::platform::XPUPlace(config_.device_id); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Place must be choosen from GPU or CPU, but got %s.", config_.place)); + "Place must be chosen from GPU, CPU, or XPU, but got %s.", + config_.place)); } return true; } diff --git a/paddle/fluid/distributed/ps/table/depends/dense.h b/paddle/fluid/distributed/ps/table/depends/dense.h index d98a91750f451..3f09376b42db3 100644 --- a/paddle/fluid/distributed/ps/table/depends/dense.h +++ b/paddle/fluid/distributed/ps/table/depends/dense.h @@ -254,9 +254,9 @@ class DAdamD2Sum : public DenseOptimizer { scale = (mat_ada_d2sum + scale).cwiseQuotient(mat_ada_g2sum + scale); scale = scale.cwiseSqrt(); mat_mom_velocity = - (mat_mom_velocity - mat_grad) * mom_decay_rate[0] + mat_grad; + (mat_mom_velocity + mat_grad) * mom_decay_rate[0] - mat_grad; - mat_w -= learning_rate[0] * mat_mom_velocity.cwiseProduct(scale); + mat_w += learning_rate[0] * mat_mom_velocity.cwiseProduct(scale); } float* learning_rate; @@ -299,7 +299,7 @@ class 
DSummary : public DenseOptimizer { } float* summary_decay_rate; - double summary_decay_rate_d = 0.9999999; + double summary_decay_rate_d = 0.999999; float* param; }; diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.cc b/paddle/fluid/distributed/ps/table/sparse_accessor.cc index 2fbb58c469ccb..1591e340b9eed 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.cc +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.cc @@ -47,7 +47,6 @@ void SparseAccessor::InitAccessorInfo() { auto embedx_dim = _config.embedx_dim(); _accessor_info.select_dim = 1 + embedx_dim; _accessor_info.select_size = _accessor_info.select_dim * sizeof(float); - ; _accessor_info.update_dim = 4 + embedx_dim; _accessor_info.update_size = _accessor_info.update_dim * sizeof(float); _accessor_info.mf_size = @@ -231,11 +230,13 @@ int32_t SparseAccessor::Update(float** update_values, _embed_sgd_rule->UpdateValue( update_value + sparse_feature_value.EmbedWIndex(), update_value + sparse_feature_value.EmbedG2SumIndex(), - push_value + SparsePushValue::EmbedGIndex()); + push_value + SparsePushValue::EmbedGIndex(), + push_show); _embedx_sgd_rule->UpdateValue( update_value + sparse_feature_value.EmbedxWIndex(), update_value + sparse_feature_value.EmbedxG2SumIndex(), - push_value + SparsePushValue::EmbedxGIndex()); + push_value + SparsePushValue::EmbedxGIndex(), + push_show); } return 0; } diff --git a/paddle/fluid/distributed/the_one_ps.proto b/paddle/fluid/distributed/the_one_ps.proto index e74502d7351b2..2241655465fb9 100755 --- a/paddle/fluid/distributed/the_one_ps.proto +++ b/paddle/fluid/distributed/the_one_ps.proto @@ -120,7 +120,7 @@ message TableParameter { optional bool enable_sparse_table_cache = 10 [ default = true ]; optional double sparse_table_cache_rate = 11 [ default = 0.00055 ]; optional uint32 sparse_table_cache_file_num = 12 [ default = 16 ]; - optional bool enable_revert = 13 [ default = true ]; + optional bool enable_revert = 13 [ default = false ]; optional float shard_merge_rate = 14 [ default = 1.0 ]; } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 4fee2e5f12728..8fde6951e03ec 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -54,43 +54,15 @@ def SkipAPIGeneration(forward_api_name): # This list contains ops that do not need to generate amp logic # All optimizer ops in this list no_amp_list = [ - 'adam_', - 'adam', - 'adamw_', - 'adamw', - 'average_accumulates', - 'average_accumulates_', - 'decayed_adagrad_', - 'decayed_adagrad', - 'dgc_momentum_', - 'dgc_momentum', - 'distributed_fused_lamb_', - 'distributed_fused_lamb', - 'dpsgd_', - 'dpsgd', - 'ftrl_', - 'ftrl', - 'lamb_', - 'lamb', - 'lars_momentum_', - 'lars_momentum', - 'merged_adam_', - 'merged_adam', - 'merged_momentum_', - 'merged_momentum', - 'momentum_', - 'momentum', - 'proximal_adagrad_', - 'proximal_adagrad', - 'proximal_gd_', - 'proximal_gd', - 'rmsprop_', - 'rmsprop', - 'sgd_', - 'sgd', - 'assign_value_', - 'sparse_momentum_', - 'sparse_momentum', + 'adam_', 'adam', 'adamw_', 'adamw', 'average_accumulates', + 'average_accumulates_', 'decayed_adagrad_', 'decayed_adagrad', + 'dgc_momentum_', 'dgc_momentum', 'distributed_fused_lamb_', + 'distributed_fused_lamb', 'dpsgd_', 'dpsgd', 'ftrl_', 'ftrl', 'lamb_', + 'lamb', 'lars_momentum_', 'lars_momentum', 'merged_adam_', 
'merged_adam', + 'merged_momentum_', 'merged_momentum', 'momentum_', 'momentum', + 'proximal_adagrad_', 'proximal_adagrad', 'proximal_gd_', 'proximal_gd', + 'rmsprop_', 'rmsprop', 'sgd_', 'sgd', 'lamb_', 'lamb', 'assign_value_', + 'sparse_momentum_', 'sparse_momentum', 'full_' ] diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index a3c3c2718d99e..b70ec78c7598c 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -360,11 +360,22 @@ std::vector RunBackward( "Node's in-degree cannot be negative.", next_node->name())); - if (node_in_degree_map[next_node] == 0) { - if (dynamic_cast(next_node)) { - queue.push_front(std::move(next_node)); - } else { - queue.push_back(std::move(next_node)); + if (is_general_grad) { + if (node_in_degree_map[next_node] == 0 && + GeneralGrad::Instance().IsNeededNodes(next_node)) { + if (dynamic_cast(next_node)) { + queue.push_front(std::move(next_node)); + } else { + queue.push_back(std::move(next_node)); + } + } + } else { + if (node_in_degree_map[next_node] == 0) { + if (dynamic_cast(next_node)) { + queue.push_front(std::move(next_node)); + } else { + queue.push_back(std::move(next_node)); + } } } } diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index f8c06a5afff12..e15b91e480a6a 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/framework/details/nan_inf_utils_detail.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/selected_rows.h" diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h index 815e3bd6cd14f..9abc1ca02f168 100644 --- a/paddle/fluid/eager/nan_inf_utils.h +++ b/paddle/fluid/eager/nan_inf_utils.h @@ -62,4 +62,25 @@ void CheckTensorHasNanOrInf( const paddle::small_vector, egr::kSlotSmallVectorSize>& tensors); +template +struct NanInfChecker { + void operator()(const std::string& api_name, const TupleT& tensors) { + CheckTensorHasNanOrInf(api_name, std::get(tensors)); + NanInfChecker()(api_name, tensors); + } +}; + +template +struct NanInfChecker { + void operator()(const std::string& api_name, const TupleT& tensors) { + CheckTensorHasNanOrInf(api_name, std::get(tensors)); + } +}; + +template +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleT& tensors) { + constexpr size_t size = std::tuple_size::value; + NanInfChecker()(api_name, tensors); +} } // namespace egr diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8af48ef51db5c..d4d5f4903f863 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -83,26 +83,27 @@ if(WITH_GPU) windows_symbolic(tensor_util SRCS tensor_util.cu) nv_library( tensor - SRCS tensor.cc .tensor_util.cu + SRCS .tensor_util.cu DEPS place memory data_type device_context dense_tensor) add_dependencies(tensor tensor_util) else() nv_library( tensor - SRCS tensor.cc tensor_util.cu - DEPS place memory data_type device_context profiler dense_tensor) + SRCS tensor_util.cu + DEPS place memory data_type device_context dense_tensor) endif() elseif(WITH_ROCM) hip_library( tensor - SRCS tensor.cc tensor_util.cu - DEPS place memory data_type device_context profiler dense_tensor) + SRCS tensor_util.cu + DEPS place memory data_type device_context dense_tensor) else() 
cc_library( tensor - SRCS tensor.cc tensor_util.cc - DEPS place memory data_type device_context profiler dense_tensor) + SRCS tensor_util.cc + DEPS place memory data_type device_context dense_tensor) endif() +# target_link(tensor profiler) cc_test( tensor_test @@ -437,6 +438,7 @@ if(WITH_XPU) SRCS operator.cc DEPS xpu_op_list op_info + proto_desc device_context tensor scope @@ -461,6 +463,7 @@ else() operator SRCS operator.cc DEPS op_info + proto_desc device_context tensor scope @@ -1167,9 +1170,6 @@ cc_library( op_meta_info phi_api) -#cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) -#cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) - set(FLUID_FRAMEWORK_MODULES proto_desc memory diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc index 13f175ce0b1cd..dd456b147ac8d 100644 --- a/paddle/fluid/framework/attribute.cc +++ b/paddle/fluid/framework/attribute.cc @@ -42,6 +42,10 @@ paddle::any GetAttrValue(const Attribute& attr) { return PADDLE_GET_CONST(std::vector, attr); case proto::AttrType::FLOAT64S: return PADDLE_GET_CONST(std::vector, attr); + case proto::AttrType::VAR: + return PADDLE_GET_CONST(VarDesc*, attr); + case proto::AttrType::VARS: + return PADDLE_GET_CONST(std::vector, attr); case proto::AttrType::BLOCK: return PADDLE_GET_CONST(BlockDesc*, attr); case proto::AttrType::BLOCKS: diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index a149c18f542e2..4d3ba2a1820be 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -221,6 +221,28 @@ inline proto::AttrType AttrTypeID(const Attribute& attr) { return static_cast(attr.index() - 1); } +inline bool IsAttrVar(const Attribute& attr) { + return AttrTypeID(attr) == proto::AttrType::VAR; +} + +inline bool IsAttrVars(const Attribute& attr) { + return AttrTypeID(attr) == proto::AttrType::VARS; +} + +inline bool HasAttrVar(const Attribute& attr) { + return IsAttrVar(attr) || IsAttrVars(attr); +} + +inline AttributeMap FilterAttrVar(const AttributeMap& attrs) { + AttributeMap attrs_var; + for (auto& attr : attrs) { + if (HasAttrVar(attr.second)) { + attrs_var.emplace(attr); + } + } + return attrs_var; +} + class AttrReader { public: explicit AttrReader(const AttributeMap& attrs) @@ -414,9 +436,15 @@ class TypedAttrChecker { } return; } + // If attribute is VarDesc(s), we should verify it's dtype and shape. 
+ auto it = attr_map->find(attr_name_); + if (it != attr_map->end() && HasAttrVar(it->second)) { + VLOG(1) << "Found Attribute " << attr_name_ + << " with Variable, skip attr_checker."; + return; + } if (only_check_exist_value) { - auto it = attr_map->find(attr_name_); if (it != attr_map->end()) { ExtractAttribute extract_attr(attr_name_); T* attr_value = extract_attr(it->second); @@ -425,7 +453,6 @@ class TypedAttrChecker { } } } else { - auto it = attr_map->find(attr_name_); if (it == attr_map->end()) { // user do not set this attr PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/framework/attribute_test.cc b/paddle/fluid/framework/attribute_test.cc index 8a47e41d38359..ab6f71926b80f 100644 --- a/paddle/fluid/framework/attribute_test.cc +++ b/paddle/fluid/framework/attribute_test.cc @@ -19,6 +19,7 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" #include "paddle/utils/any.h" TEST(Attribute, GetAttrValueToAny) { @@ -72,6 +73,25 @@ TEST(Attribute, GetAttrValueToAny) { EXPECT_EQ(vec_bool[0], true); EXPECT_EQ(vec_bool[1], true); + paddle::framework::VarDesc var_desc("axis"); + paddle::framework::Attribute var_attr(&var_desc); + auto rlt_var_attr = paddle::framework::GetAttrValue(var_attr); + auto var_desc_ptr = + paddle::any_cast(rlt_var_attr); + EXPECT_NE(var_desc_ptr, nullptr); + EXPECT_EQ(var_desc_ptr->Name(), var_desc.Name()); + + paddle::framework::VarDesc var2_desc("prob"); + std::vector vars_desc{&var_desc, &var2_desc}; + paddle::framework::Attribute vars_attr(vars_desc); + + auto rlt_vars_attr = paddle::framework::GetAttrValue(vars_attr); + auto rlt_vars_desc = + paddle::any_cast>(rlt_vars_attr); + EXPECT_EQ(rlt_vars_desc.size(), vars_desc.size()); + EXPECT_EQ(rlt_vars_desc[0]->Name(), vars_desc[0]->Name()); + EXPECT_EQ(rlt_vars_desc[1]->Name(), vars_desc[1]->Name()); + paddle::framework::ProgramDesc prog; paddle::framework::proto::BlockDesc proto_block; paddle::framework::BlockDesc block_desc(&prog, &proto_block); diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index 84d52c996d056..e8d26f6728260 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -176,19 +176,48 @@ std::vector BlockDesc::AllOps() const { } void BlockDesc::Flush() { + auto need_update = NeedUpdate(true); for (auto &op_desc : ops_) { op_desc->Flush(); } - - if (need_update_) { + // no flush for var_desc? or is op_desc flush really needed? 
+ VLOG(10) << "Flush " << NeedUpdate(true) << " " << need_update << std::endl; + if (need_update) { this->desc_->mutable_ops()->Clear(); for (auto &op_desc : ops_) { this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto()); + // op_desc's need_update is set to false in op_desc->Flush(); + } + + std::vector var_names; + std::set var_names_set; + + // keep order + for (const auto &var : this->desc_->vars()) { + var_names.emplace_back(var.name()); + var_names_set.insert(var.name()); } + this->desc_->mutable_vars()->Clear(); + for (const auto &name : var_names) { + if (vars_.count(name)) { + this->desc_->mutable_vars()->Add()->CopyFrom(*vars_[name]->Proto()); + vars_[name]->SetNeedUpdate(false); + } + } + for (auto &var_desc : vars_) { - this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto()); + if (var_names_set.count(var_desc.first) != 1) { + this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto()); + var_desc.second->SetNeedUpdate(false); + } } + + // this->desc_->mutable_vars()->Clear(); + // for (auto &var_desc : vars_) { + // this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto()); + // var_desc.second->SetNeedUpdate(false); + // } need_update_ = false; } } @@ -207,6 +236,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc) for (const proto::VarDesc &var_desc : desc_->vars()) { vars_[var_desc.name()].reset(new VarDesc(var_desc)); } + for (const proto::OpDesc &op_desc : desc_->ops()) { ops_.emplace_back(new OpDesc(op_desc, this)); } @@ -217,13 +247,15 @@ BlockDesc::BlockDesc(const BlockDesc &other, ProgramDesc *prog) : prog_(prog), desc_(desc) { need_update_ = true; - for (auto &op : other.ops_) { - ops_.emplace_back(new OpDesc(*op, this)); - } + // NOTE(dev): Init vars_ firstly so we can find them + // while constructing OpDesc. 
for (auto &it : other.vars_) { auto *var = new VarDesc(*it.second); vars_[it.first].reset(var); } + for (auto &op : other.ops_) { + ops_.emplace_back(new OpDesc(*op, this)); + } } void BlockDesc::SetForwardBlockID(int32_t forward_block_id) { @@ -273,7 +305,10 @@ void BlockDesc::MoveFrom(BlockDesc *block) { const auto &attr_name = pair.first; const auto &attr_value = pair.second; auto attr_type = static_cast(attr_value.index() - 1); - if (attr_type == proto::AttrType::BLOCK) { + if (attr_type == proto::AttrType::VAR || + attr_type == proto::AttrType::VARS) { + dst_op->UpdateVarAttr(attr_name, attr_value); + } else if (attr_type == proto::AttrType::BLOCK) { auto block_id = PADDLE_GET_CONST(BlockDesc *, attr_value)->ID(); dst_op->SetBlockAttr(attr_name, prog_->MutableBlock(block_id)); VLOG(10) << "Set block attr " << attr_name << " id " << block_id; @@ -299,5 +334,24 @@ void BlockDesc::MoveFrom(BlockDesc *block) { block->Flush(); } +bool BlockDesc::NeedUpdate(bool include_subs) { + bool need = need_update_; + if (include_subs) { + for (const auto &op : ops_) { + if (op->NeedUpdate()) { + need = true; + break; + } + } + for (const auto &pair : vars_) { + if (pair.second->NeedUpdate()) { + need = true; + break; + } + } + } + return need; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index e4e5a71a46c86..bb7227d07163c 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -113,10 +113,13 @@ class BlockDesc { void MoveFrom(BlockDesc *block); + bool NeedUpdate(bool include_subs = true); + private: ProgramDesc *prog_; // not_own proto::BlockDesc *desc_; // not_own - bool need_update_; + bool need_update_; // block itself need_update, not aware of its ops_ and + // vars_ std::deque> ops_; std::unordered_map> vars_; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index 226a1db1d3b8f..99186c43e129e 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -17,12 +17,9 @@ #include #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/place.h" -namespace phi { -class DenseTensor; -} // namespace phi - namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 6c19cf3450dbd..9e3604e71a245 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -17,6 +17,8 @@ #include #include +#include "paddle/fluid/platform/device/gpu/gpu_info.h" + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 0d3e7c2741c17..391197d967abe 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -36,6 +36,8 @@ enum AttrType { BLOCKS = 10; LONGS = 11; FLOAT64S = 12; + VAR = 13; + VARS = 14; } message ProcessMeshDesc { @@ -65,6 +67,8 @@ message OpDesc { repeated int32 blocks_idx = 14; repeated int64 longs = 15; repeated double float64s = 16; + optional string var_name = 17; + repeated string vars_name = 18; }; message Var { diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.h b/paddle/fluid/framework/inference_cached_ops.h similarity index 58% rename from 
paddle/fluid/operators/fill_diagonal_tensor_op.h rename to paddle/fluid/framework/inference_cached_ops.h index f3e41a9c9332c..50444e180718a 100644 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.h +++ b/paddle/fluid/framework/inference_cached_ops.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,21 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#include #include -#include "paddle/fluid/framework/op_registry.h" - namespace paddle { -namespace operators { +namespace framework { -void CalMatDims(framework::DDim out_dims, - int dim1, - int dim2, - int64_t *offset, - int64_t *new_dims, - int64_t *strides, - int64_t *matoffset); +// cached ops will be captured to accelerate gpu performance. +// 1. op will generate a cudaGraph to record inner gpu kernels +// 2. inner gpu kernels can be launched by calling the cudagraphExecutor +// only once. +std::vector cached_gpu_ops{"conv2d_fusion", "depthwise_conv2d"}; -} // namespace operators +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index c525888ca116c..eb988d59a2a8b 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -451,12 +451,13 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { auto& attr_name = attr_names[i]; - VLOG(6) << "BuildInferMetaContext: " << attr_name << ": " - << attr_defs[i].type_index; auto* attr_ptr = attr_reader.GetAttr(attr_name); + bool is_attr_var = attr_ptr != nullptr && HasAttrVar(*attr_ptr); + VLOG(6) << "BuildInferMetaContext: " << attr_name << ": " + << attr_defs[i].type_index << ", is_attr_var: " << is_attr_var; switch (attr_defs[i].type_index) { case phi::AttributeType::SCALAR: - if (attr_ptr) { + if (attr_ptr && !is_attr_var) { auto& attr = *attr_ptr; switch (AttrTypeID(attr)) { case framework::proto::AttrType::FLOAT: @@ -502,7 +503,7 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, break; case phi::AttributeType::INT_ARRAY: // When attr is a vector_tensor or tensor, transform it to IntArray - if (attr_ptr) { + if (attr_ptr && !is_attr_var) { auto& attr = *attr_ptr; switch (AttrTypeID(attr)) { case framework::proto::AttrType::INTS: diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index bab6cc764afed..3c04bcf539ad2 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -214,15 +214,13 @@ if(WITH_MKLDNN) pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(matmul_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) pass_library(reshape_transpose_matmul_mkldnn_fuse_pass inference DIR mkldnn) - 
pass_library(reshape_transpose_matmul_v2_mkldnn_fuse_pass inference DIR - mkldnn) - pass_library(matmul_transpose_reshape_fuse_pass inference DIR mkldnn) - pass_library(matmul_v2_transpose_reshape_fuse_pass inference DIR mkldnn) + pass_library(matmul_transpose_reshape_mkldnn_fuse_pass inference DIR mkldnn) pass_library(batch_norm_act_fuse_pass inference DIR mkldnn) pass_library(multi_gru_fuse_pass inference DIR mkldnn) pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn) @@ -498,13 +496,11 @@ if(WITH_MKLDNN) cc_test( test_reshape_transpose_matmul_mkldnn_fuse_pass SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc - DEPS reshape_transpose_matmul_mkldnn_fuse_pass - reshape_transpose_matmul_v2_mkldnn_fuse_pass) + DEPS reshape_transpose_matmul_mkldnn_fuse_pass) cc_test( test_matmul_transpose_reshape_fuse_pass - SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc - DEPS matmul_transpose_reshape_fuse_pass - matmul_v2_transpose_reshape_fuse_pass) + SRCS mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc + DEPS matmul_transpose_reshape_mkldnn_fuse_pass) cc_test( test_shuffle_channel_mkldnn_detect_pass SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc diff --git a/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc old mode 100644 new mode 100755 index 6edd8c3e4de45..570da9a879fb6 --- a/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/gpu_cpu_map_matmul_to_mul_pass.cc @@ -356,7 +356,7 @@ void GpuCpuMapMatmulV2ToMulPass::ApplyImpl(ir::Graph* graph) const { size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); flag = flag && x_rank >= 2 && y_rank == 2; - + flag = flag && x_shape[x_rank - 1] == y_shape[0]; if (flag) { if (!IsCompat(subgraph, g)) { LOG(WARNING) << "GpuCpuMapMatmulV2ToMulPass in op compat failed."; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index fceed0fc44e5f..6946fb6d7d9ee 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -149,7 +149,7 @@ std::map> Graph::InitFromBlock( ++desc_order; // For input args, reuse the same var name if it was created before. // Otherwise, create a new one. - for (auto &each_var_name : op->InputArgumentNames()) { + for (auto &each_var_name : op->InputArgumentNames(true)) { not_visited_vars.erase(each_var_name); ir::Node *var = nullptr; if (var_nodes.find(each_var_name) != var_nodes.end()) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index ad11656a34170..0ec100704ae7e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2018,6 +2018,33 @@ PDNode *patterns::ElementwiseOp::operator()( return out_var; } +PDNode *patterns::MatmulElementwiseAdd::operator()( + const std::string &matmul_type, bool as_x) { + auto matmul_op = + pattern->NewNode(matmul_op_repr())->assert_is_op(matmul_type); + auto matmul_out = + pattern->NewNode(matmul_out_repr()) + ->AsIntermediate() + ->assert_is_op_output(matmul_type, "Out") + ->assert_is_only_output_of_op(matmul_type) + ->assert_is_op_input("elementwise_add", as_x ? "X" : "Y"); + auto elementwise_addend = + pattern->NewNode(elementwise_addend_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", as_x ? 
"Y" : "X"); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_out = + pattern->NewNode(elementwise_add_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add", "Out"); + + matmul_op->LinksTo({matmul_out}); + elementwise_add_op->LinksFrom({matmul_out, elementwise_addend}) + .LinksTo({elementwise_add_out}); + return elementwise_add_out; +} + PDNode *patterns::ResidualElementwise::operator()( PDNode *op_var, PDNode *residual_var, diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 80a8f968acde5..a1112383e073f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1038,6 +1038,21 @@ struct ElementwiseOp : public PatternBase { PATTERN_DECL_NODE(elementwise_out); }; +struct MatmulElementwiseAdd : public PatternBase { + MatmulElementwiseAdd(PDPattern* pattern, + const std::string& name_scope, + const std::string& matmul_type, + bool as_x) + : PatternBase(pattern, name_scope, "matmul_elementwise_add") {} + + PDNode* operator()(const std::string& matmul_type, bool as_x); + PATTERN_DECL_NODE(matmul_op); + PATTERN_DECL_NODE(matmul_out); + PATTERN_DECL_NODE(elementwise_addend); + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_out); +}; + // Residual Elementwise ops // This pattern allows operator output to be X or Y // and residual data Y or X, based on as_x flag diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index 2eb35291803f6..a9bc746680c16 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -113,6 +113,7 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConv( if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; if (!IsReachable(g, residual_data, conv_output)) return; if (HasFusedActivation(conv_op)) return; + if (HasFusedElementwiseAdd(conv_op)) return; if (!IsCompat(subgraph, g)) { LOG(WARNING) @@ -120,6 +121,12 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConv( return; } + if (residual_data->Var()->GetShape() != conv_output->Var()->GetShape()) { + LOG(WARNING) << "conv_elementwise_add_mkldnn_fuse_pass doesn't support " - + "broadcasting"; + return; + } + conv_op->Op()->SetInput("ResidualData", {residual_data->Name()}); conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index 7c6e9927163c7..86f65480ad1d9 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -44,6 +44,9 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { ->GetAttrIfExists("fuse_activation") .empty()); } + static bool HasFusedElementwiseAdd(Node* conv_node) { + return conv_node->Op()->GetAttrIfExists("fuse_residual_connection"); + } const std::string name_scope_{"residual_connection_fuse_pass"}; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index f1c1b57f3f662..d64fbe16a3eb4 
100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -199,8 +199,11 @@ class DeQuantizer final : public Quanter { bool IsNotPermittedName(const std::string& output_name) const override { std::unordered_map> block_list{ {"layer_norm", - {"Mean", "Variance"}}, // not used in inference in MKLDNN - {"fc", {"ResidualData"}}}; // artifical output, already dequantized + {"Mean", "Variance"}}, // not used in inference in MKLDNN + {"fc", {"ResidualData"}}, // artifical output, already dequantized + {"matmul", {"ResidualData"}}, // artifical output, already dequantized + {"matmul_v2", + {"ResidualData"}}}; // artifical output, already dequantized std::vector blocked_outputs{"XShape"}; // blocklist for any op auto op_name = op->Name(); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc index d3f71e498bfe8..9ba89106c3471 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc @@ -26,7 +26,7 @@ using string::PrettyLogDetail; void MatmulActivationMkldnnFusePass::ApplyImpl(Graph* graph) const { auto act_types = paddle::platform::GetSupportedActivations(); - std::vector matmul_types = {"matmul"}; + auto matmul_types = {"matmul", "matmul_v2"}; for (const auto& matmul_type : matmul_types) for (auto& act_type : act_types) { @@ -88,8 +88,9 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct( gpd(graph, handler); AddStatis(found_matmul_activation_count); if (!Has("disable_logs") || !Get("disable_logs")) { - PrettyLogDetail("--- fused %d matmul with %s activation", + PrettyLogDetail("--- fused %d %s with %s activation", found_matmul_activation_count, + matmul_type, act_type); } } @@ -102,6 +103,11 @@ MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() { .AddInput("Y") .IsTensor() .End() + .AddInput( + "ResidualData") // Extra tensor used in matmul+elementwise_add fuse + .IsTensor() + .IsOptional() + .End() .AddOutput("Out") .IsTensor() .End() @@ -115,6 +121,28 @@ MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() { .IsType() .End(); + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddInput( + "ResidualData") // Extra tensor used in matmul+elementwise_add fuse + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + AddOpCompat(OpCompat("abs")) .AddInput("X") .IsTensor() @@ -267,6 +295,7 @@ REGISTER_PASS_CAPABILITY(matmul_activation_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("matmul", 1) + .EQ("matmul_v2", 0) .EQ("abs", 0) .LE("clip", 1) .EQ("gelu", 0) diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000..2e6e450cd4c72 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.h" + +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void MatmulElementwiseAddMKLDNNFusePass::ApplyImpl(Graph* graph) const { + auto matmul_types = {"matmul", "matmul_v2"}; + auto matmul_as_x = {true, false}; + + for (const auto& matmul_type : matmul_types) + for (const auto& as_x : matmul_as_x) { + FuseMatmulElementwiseAdd(graph, matmul_type, as_x); + } +} + +void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd( + Graph* graph, const std::string& matmul_type, bool matmul_as_x) const { + const std::string fusion_mode = matmul_as_x ? "x" : "y"; + const auto name_scope = matmul_type + "_elementwise_add_as_" + fusion_mode; + FusePassBase::Init(name_scope, graph); + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::MatmulElementwiseAdd matmul_pattern( + pattern, name_scope, matmul_type, matmul_as_x); + matmul_pattern(matmul_type, matmul_as_x); + + int found_matmul_elementwise_add_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(matmul, matmul_op, matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + elementwise_add, elementwise_add_op, matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + elementwise_addend, elementwise_addend, matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + elementwise_add_out, elementwise_add_out, matmul_pattern); + + if (FindFuseOption(*matmul, *elementwise_add) != FUSE_MKLDNN) return; + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "op compat for matmul_elementwise_add_mkldnn_fuse_pass failed."; + return; + } + if (matmul->Op()->HasAttr("ResidualData")) { + LOG(WARNING) << "matmul_elementwise_add can be fused once"; + return; + } + + matmul->Op()->SetInput("ResidualData", {elementwise_addend->Name()}); + matmul->Op()->SetOutput("Out", {elementwise_add_out->Name()}); + + GraphSafeRemoveNodes(g, {matmul_out, elementwise_add}); + + IR_NODE_LINK_TO(elementwise_addend, matmul); + IR_NODE_LINK_TO(matmul, elementwise_add_out); + + found_matmul_elementwise_add_count++; + }; + + gpd(graph, handler); + AddStatis(found_matmul_elementwise_add_count); + if (!Has("disable_logs") || !Get("disable_logs")) { + PrettyLogDetail("--- fused %d %s (as %s) with elementwise_add", + found_matmul_elementwise_add_count, + matmul_type, + fusion_mode); + } +} + +MatmulElementwiseAddMKLDNNFusePass::MatmulElementwiseAddMKLDNNFusePass() { + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsType() + .End() + .AddAttr("transpose_X") + .IsType() + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); + + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + 
.IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 0, 1}) + .End(); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(matmul_elementwise_add_mkldnn_fuse_pass, + paddle::framework::ir::MatmulElementwiseAddMKLDNNFusePass); +REGISTER_PASS_CAPABILITY(matmul_elementwise_add_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .LE("elementwise_add", 1)); diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.h similarity index 57% rename from paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.h index 7eeda7f1a61a4..c630fd0b8741e 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,26 +14,25 @@ #pragma once -#include - -#include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { namespace ir { -/* - * Fuse Reshape->Transpose->MatMulV2 when MatMulV2 uses mkldnn. - */ -class ReshapeTransposeMatmulV2MkldnnFusePass - : public ReshapeTransposeMatmulMkldnnFusePass { +class MatmulElementwiseAddMKLDNNFusePass : public FusePassBase { public: - ReshapeTransposeMatmulV2MkldnnFusePass(); - virtual ~ReshapeTransposeMatmulV2MkldnnFusePass() {} + MatmulElementwiseAddMKLDNNFusePass(); + virtual ~MatmulElementwiseAddMKLDNNFusePass() {} protected: - const std::string name_scope_{"reshape_transpose_matmul_v2_fuse"}; + void ApplyImpl(Graph* graph) const; + void FuseMatmulElementwiseAdd(Graph* graph, + const std::string& matmul_type, + bool matmul_as_x) const; }; + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc similarity index 70% rename from paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc rename to paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc index 09bf9c57c4728..ce892aa86838a 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -12,12 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h" - +#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h" #include - -#include - #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -25,76 +21,28 @@ namespace paddle { namespace framework { namespace ir { -MatmulTransposeReshapeMKLDNNPass::MatmulTransposeReshapeMKLDNNPass() { - op_name_ = "matmul"; +using string::PrettyLogDetail; - AddOpCompat(OpCompat(op_name_)) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") // unconstrained. can be any float value. - .IsType() - .End() - .AddAttr("transpose_X") // unconstrained. can be any bool value. - .IsType() - .End() - .AddAttr("transpose_Y") // unconstrained. can be any bool value. - .IsType() - .End(); +void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(Graph *graph) const { + auto matmul_types = {"matmul", "matmul_v2"}; - AddOpCompat(OpCompat("transpose2")) - .AddInput("X") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddOutput("XShape") - .IsTensor() - .End() - .AddAttr("axis") // ints - .IsType>() - .End(); - - AddOpCompat(OpCompat("reshape2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Shape") - .IsTensor() - .IsOptional() - .End() - .AddInput("ShapeTensor") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddOutput("XShape") - .IsTensor() - .End() - .AddAttr("shape") // ints - .IsType>() - .End(); + for (const auto &matmul_type : matmul_types) { + Fuse(graph, matmul_type); + } } -void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { + +void MatmulTransposeReshapeMKLDNNPass::Fuse( + Graph *graph, const std::string &matmul_type) const { PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::InvalidArgument( "Pointer to graph argument should not be NULL.")); - FusePassBase::Init(name_scope_, graph); - + FusePassBase::Init(matmul_type + "_transpose_reshape_mkldnn_fuse_pass", + graph); GraphPatternDetector gpd; - patterns::MatmulTransposeReshapePattern mtrp(gpd.mutable_pattern(), - name_scope_); - - mtrp(op_name_); + patterns::MatmulTransposeReshapePattern mtrp( + gpd.mutable_pattern(), + matmul_type + "_transpose_reshape_mkldnn_fuse_pass"); + mtrp(matmul_type); int found_matmul_transpose_reshape_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, @@ -103,7 +51,7 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { LOG(WARNING) << "Pass in op compat failed."; return; } - VLOG(4) << "handle " + op_name_ + "_transpose_reshape fuse"; + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, mtrp); GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, mtrp); GET_IR_NODE_FROM_SUBGRAPH(transpose_op, transpose_op, mtrp); @@ -112,6 +60,7 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { GET_IR_NODE_FROM_SUBGRAPH(reshape_op, reshape_op, mtrp); GET_IR_NODE_FROM_SUBGRAPH(reshape_out, reshape_out, mtrp); GET_IR_NODE_FROM_SUBGRAPH(reshape_out_xshape, reshape_out_xshape, mtrp); + auto reshape_shape = PADDLE_GET_CONST(std::vector, reshape_op->Op()->GetAttr("shape")); auto transpose_axis = @@ -123,17 +72,17 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { const bool supported_transpose_axis = std::equal( 
transpose_axis.begin(), transpose_axis.end(), supported_axis.begin()); if (transpose_out_size != 4) { - VLOG(3) << "do not perform " + op_name_ + "_transpose_reshape fuse: " + VLOG(3) << "do not perform " + matmul_type + "_transpose_reshape fuse: " << "supported rank is 4, received " << transpose_out_size; return; } if (!supported_transpose_axis) { - VLOG(3) << "do not perform " + op_name_ + "_transpose_reshape fuse: " + VLOG(3) << "do not perform " + matmul_type + "_transpose_reshape fuse: " << "supported transpose axis for the fuse are {0, 2, 1, 3}"; return; } if (reshape_out_size != 3) { - VLOG(3) << "do not perform " + op_name_ + "_transpose_reshape fuse: " + VLOG(3) << "do not perform " + matmul_type + "_transpose_reshape fuse: " << "reshape_out supported rank is 3, received " << reshape_out_size; return; @@ -158,23 +107,93 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { gpd(graph, handler); AddStatis(found_matmul_transpose_reshape_count); - if (!Has("disable_logs") || !Get("disable_logs")) { - std::stringstream msg_ss; - msg_ss << "--- Fused " << found_matmul_transpose_reshape_count - << " MatmulTransposeReshape patterns for " + op_name_ + " Op"; - paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + if ((!Has("disable_logs") || !Get("disable_logs")) && + found_matmul_transpose_reshape_count > 0) { + PrettyLogDetail("--- fused %d %s + transpose + reshape patterns", + found_matmul_transpose_reshape_count, + matmul_type); } } + +MatmulTransposeReshapeMKLDNNPass::MatmulTransposeReshapeMKLDNNPass() { + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsType() + .End() + .AddAttr("transpose_X") + .IsType() + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); + + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("axis") + .IsType>() + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + // The reshape2 op for this pass should not have "Shape" and "ShapeTensor" + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("shape") + .IsType>() + .End(); +} + } // namespace ir } // namespace framework } // namespace paddle -REGISTER_PASS(matmul_transpose_reshape_fuse_pass, +REGISTER_PASS(matmul_transpose_reshape_mkldnn_fuse_pass, paddle::framework::ir::MatmulTransposeReshapeMKLDNNPass); -REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_fuse_pass) +REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("matmul", 1) - .EQ("transpose", 0) - .EQ("reshape", 0)); + .EQ("matmul_v2", 0) + .EQ("transpose2", 0) + .EQ("reshape2", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h similarity index 80% rename from paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h index 
e03746e6e80e8..36bc97876ce73 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,14 +14,11 @@ #pragma once -#include - #include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { namespace framework { namespace ir { -class Graph; class MatmulTransposeReshapeMKLDNNPass : public FusePassBase { public: @@ -29,10 +26,10 @@ class MatmulTransposeReshapeMKLDNNPass : public FusePassBase { virtual ~MatmulTransposeReshapeMKLDNNPass() {} protected: - void ApplyImpl(Graph* graph) const override; - const std::string name_scope_{"matmul_transpose_reshape_fuse"}; - std::string op_name_; + void ApplyImpl(Graph *graph) const override; + void Fuse(Graph *graph, const std::string &matmul_type) const; }; + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc similarity index 92% rename from paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc rename to paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc index 75cc3e12c2e9f..4149bb2347317 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass_tester.cc @@ -14,7 +14,7 @@ #include -#include "paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_mkldnn_fuse_pass.h" namespace paddle { namespace framework { @@ -74,7 +74,7 @@ void MainTest(const ProgramDesc &prog, const std::string &op_name) { int original_nodes_num = graph->Nodes().size(); auto pass = - PassRegistry::Instance().Get(op_name + "_transpose_reshape_fuse_pass"); + PassRegistry::Instance().Get("matmul_transpose_reshape_mkldnn_fuse_pass"); graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); @@ -106,5 +106,4 @@ TEST(MatmulTransposeReshapeFusePass, matmul_v2_fuse_pass) { } // namespace framework } // namespace paddle -USE_PASS(matmul_transpose_reshape_fuse_pass); -USE_PASS(matmul_v2_transpose_reshape_fuse_pass); +USE_PASS(matmul_transpose_reshape_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index a3b1f730dfc24..d78c0c4356266 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -20,17 +20,18 @@ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(leaky_relu); -USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); +PD_DECLARE_KERNEL(leaky_relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(gelu); USE_OP_ITSELF(relu); 
USE_OP_ITSELF(tanh); -USE_OP_DEVICE_KERNEL(tanh, MKLDNN); +PD_DECLARE_KERNEL(tanh, OneDNN, ALL_LAYOUT); PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index 20bfe5726f659..29e013c55a40b 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,11 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h" - -#include -#include -#include - #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" @@ -26,78 +21,46 @@ namespace paddle { namespace framework { namespace ir { -ReshapeTransposeMatmulMkldnnFusePass::ReshapeTransposeMatmulMkldnnFusePass() { - op_name_ = "matmul"; - - AddOpCompat(OpCompat("reshape2")) - .AddInput("X") - .IsTensor() - .End() - // The reshape2 op for this pass should not have "Shape" and "ShapeTensor" - .AddOutput("Out") - .IsTensor() - .End() - .AddOutput("XShape") - .IsOptional() - .IsTensor() - .End() - .AddAttr("shape") - .IsType>() - .End(); - - AddOpCompat(OpCompat("transpose2")) - .AddInput("X") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddOutput("XShape") - .IsOptional() - .IsTensor() - .End() - .AddAttr("axis") - .IsType>() - .End(); +void ReshapeTransposeMatmulMkldnnFusePass::ApplyImpl(Graph *graph) const { + auto matmul_types = {"matmul", "matmul_v2"}; + bool with_reshape_xshape = true; + bool with_transpose_xshape = true; - AddOpCompat(OpCompat(op_name_)) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsType() - .End() - .AddAttr("transpose_X") - .IsType() - .End() - .AddAttr("transpose_Y") - .IsType() - .End(); + for (const auto &matmul_type : matmul_types) { + Fuse(graph, matmul_type, with_reshape_xshape, with_transpose_xshape); + Fuse(graph, matmul_type, with_reshape_xshape, !with_transpose_xshape); + Fuse(graph, matmul_type, !with_reshape_xshape, with_transpose_xshape); + Fuse(graph, matmul_type, !with_reshape_xshape, !with_transpose_xshape); + } } void ReshapeTransposeMatmulMkldnnFusePass::Fuse( - Graph *graph, bool with_reshape_xshape, bool with_transpose_xshape) const { + Graph *graph, + const std::string &matmul_type, + bool with_reshape_xshape, + bool with_transpose_xshape) const { + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::InvalidArgument( + "Pointer to graph argument should not be NULL.")); + FusePassBase::Init("reshape_transpose_" + matmul_type + "_mkldnn_fuse_pass", + graph); + GraphPatternDetector gpd; - patterns::ReshapeTransposeMatmulPattern rtm_pattern(gpd.mutable_pattern(), - name_scope_); + patterns::ReshapeTransposeMatmulPattern rtm_pattern( + gpd.mutable_pattern(), + "reshape_transpose_" + matmul_type + "_mkldnn_fuse_pass"); - rtm_pattern(op_name_, with_reshape_xshape, with_transpose_xshape); + rtm_pattern(matmul_type, with_reshape_xshape, with_transpose_xshape); int 
found_reshape_transpose_matmul_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Op compatible check in reshape_transpose_" << op_name_ + LOG(WARNING) << "Op compatible check in reshape_transpose_" << matmul_type << "_mkldnn_fuse_pass failed."; return; } - VLOG(4) << "handle reshape_transpose_" << op_name_ << " fuse"; + GET_IR_NODE_FROM_SUBGRAPH(reshape_in, reshape_in, rtm_pattern); GET_IR_NODE_FROM_SUBGRAPH(reshape_op, reshape_op, rtm_pattern); GET_IR_NODE_FROM_SUBGRAPH(reshape_out, reshape_out, rtm_pattern); @@ -137,7 +100,7 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( UpdateMatmul("Y"); } else { throw platform::errors::InvalidArgument("Unexpected input to " + - op_name_ + " encountered."); + matmul_type + " encountered."); } std::unordered_set nodes_to_remove{ @@ -153,26 +116,85 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( gpd(graph, handler); AddStatis(found_reshape_transpose_matmul_count); - if (!Has("disable_logs") || !Get("disable_logs")) { + if ((!Has("disable_logs") || !Get("disable_logs")) && + found_reshape_transpose_matmul_count > 0) { std::stringstream msg_ss; - msg_ss << "--- Fused " << found_reshape_transpose_matmul_count - << " ReshapeTransposeMatmul patterns for " << op_name_ << " Op"; + msg_ss << "--- fused " << found_reshape_transpose_matmul_count + << " reshape + transpose + " << matmul_type; if (with_reshape_xshape) msg_ss << " with reshape's xshape"; if (with_transpose_xshape) msg_ss << " with transpose's xshape"; string::PrettyLogDetail(msg_ss.str().c_str()); } } -void ReshapeTransposeMatmulMkldnnFusePass::ApplyImpl(ir::Graph *graph) const { - PADDLE_ENFORCE_NOT_NULL(graph, - platform::errors::InvalidArgument( - "Pointer to graph argument should not be NULL.")); - FusePassBase::Init(name_scope_, graph); +ReshapeTransposeMatmulMkldnnFusePass::ReshapeTransposeMatmulMkldnnFusePass() { + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + // The reshape2 op for this pass should not have "Shape" and "ShapeTensor" + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("shape") + .IsType>() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("axis") + .IsType>() + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsType() + .End() + .AddAttr("transpose_X") + .IsType() + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); - Fuse(graph, false, false); - Fuse(graph, false, true); - Fuse(graph, true, false); - Fuse(graph, true, true); + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); } } // namespace ir @@ -184,5 +206,8 @@ REGISTER_PASS(reshape_transpose_matmul_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(reshape_transpose_matmul_mkldnn_fuse_pass) .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination().EQ( - "matmul", 1)); + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("matmul", 1) + .EQ("matmul_v2", 0)); diff 
--git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h index 187bfe0650a64..4b595837b23a7 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,17 +13,11 @@ // limitations under the License. #pragma once - -#include - #include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { namespace framework { namespace ir { -/* - * Fuse Reshape->Transpose->MatMul when MatMul uses mkldnn. - */ class ReshapeTransposeMatmulMkldnnFusePass : public FusePassBase { public: @@ -31,13 +25,11 @@ class ReshapeTransposeMatmulMkldnnFusePass : public FusePassBase { virtual ~ReshapeTransposeMatmulMkldnnFusePass() {} protected: - void ApplyImpl(ir::Graph* graph) const override; - const std::string name_scope_{"reshape_transpose_matmul_fuse"}; - + void ApplyImpl(Graph* graph) const override; void Fuse(Graph* graph, + const std::string& matmul_type, bool with_reshape_xshape, bool with_transpose_xshape) const; - std::string op_name_; }; } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc index 369ceec934ed7..79164a32098b2 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc @@ -15,7 +15,6 @@ #include #include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h" -#include "paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_v2_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { @@ -82,8 +81,8 @@ void TestMain(const std::string& op_name, bool with_xshapes) { int total_nodes_before = graph->Nodes().size(); VLOG(3) << DebugString(graph); - auto pass = PassRegistry::Instance().Get("reshape_transpose_" + op_name + - "_mkldnn_fuse_pass"); + auto pass = + PassRegistry::Instance().Get("reshape_transpose_matmul_mkldnn_fuse_pass"); graph.reset(pass->Apply(graph.release())); int num_reshape_nodes_after = GetNumOpNodes(graph, "reshape2"); @@ -137,4 +136,3 @@ TEST(ReshapeTransposeMatmulV2MkldnnFusePass, } // namespace paddle USE_PASS(reshape_transpose_matmul_mkldnn_fuse_pass); -USE_PASS(reshape_transpose_matmul_v2_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/swin_attention1_fuse_pass.cc b/paddle/fluid/framework/ir/swin_attention1_fuse_pass.cc index 8b51e6dffff9e..d74edad34a845 100644 --- a/paddle/fluid/framework/ir/swin_attention1_fuse_pass.cc +++ b/paddle/fluid/framework/ir/swin_attention1_fuse_pass.cc @@ -93,6 +93,14 @@ void SwinAttention1FusePass::ApplyImpl(ir::Graph* graph) const { auto bias_qkv_dims = phi::make_ddim({3, bias_qkv_tensor->dims()[0]/3}); bias_qkv_tensor->Resize(bias_qkv_dims); + auto * bias_qk_1_var = scope->FindVar(elementwise_70_in_y->Name()); + auto* bias_qk_1_tensor = bias_qk_1_var->GetMutable(); + auto bias_qk_1_dims = bias_qk_1_tensor->dims(); + auto* bias_qk_1_data = 
bias_qk_1_tensor->mutable_data(platform::CPUPlace()); + printf("@@@ in pass biasqk 0: %f ",bias_qk_1_data[0]); + VLOG(0)<<"@@@ bias_qk_1_tensor:"; + VLOG(0)< softmax_shape=softmax_80_out->Var()->GetShape(); float alpha=PADDLE_GET_CONST(float,scale_50_op->Op()->GetAttr("scale")); diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index f72a8157970b5..e01fcb68fbf3b 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -39,15 +39,10 @@ set(STANDALONE_EXECUTOR_DEPS scope glog workqueue - interpretercore_event_garbage_collector + interpretercore_garbage_collector ${DEVICE_EVENT_LIBS} glog) -if(WITH_GPU OR WITH_ROCM) - set(STANDALONE_EXECUTOR_DEPS ${STANDALONE_EXECUTOR_DEPS} - interpretercore_fast_garbage_collector) -endif() - cc_library( standalone_executor SRCS ${STANDALONE_EXECUTOR_SRCS} diff --git a/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt b/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt index 359c56c561a4d..d7ff6e4d50f20 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt @@ -1,22 +1,5 @@ cc_library( interpretercore_garbage_collector - SRCS garbage_collector.cc + SRCS garbage_collector.cc event_garbage_collector.cc fast_garbage_collector.cc + no_event_garbage_collector.cc DEPS garbage_collector) -cc_library( - interpretercore_event_garbage_collector - SRCS event_garbage_collector.cc - DEPS interpretercore_garbage_collector) - -if(WITH_GPU OR WITH_ROCM) - if(WITH_GPU) - nv_library( - interpretercore_fast_garbage_collector - SRCS fast_garbage_collector.cc - DEPS interpretercore_garbage_collector) - elseif(WITH_ROCM) - hip_library( - interpretercore_fast_garbage_collector - SRCS fast_garbage_collector.cc - DEPS interpretercore_garbage_collector) - endif() -endif() diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc index 5f3386d52da65..6133d6ece8d06 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc @@ -24,48 +24,33 @@ namespace paddle { namespace framework { -InterpreterCoreEventGarbageCollector::InterpreterCoreEventGarbageCollector() { +InterpreterCoreEventGarbageCollector::InterpreterCoreEventGarbageCollector( + const std::vector& vec_instruction) { WorkQueueOptions options(/*name*/ "GarbageCollector", /*num_threads*/ 1, /*allow_spinning*/ true, /*track_task*/ false); queue_ = CreateSingleThreadedWorkQueue(options); + for (auto& instruc : vec_instruction) { + gc_event_.emplace_back(instruc.DeviceContext().GetPlace(), + platform::GenerateDeviceEventFlag()); + } } InterpreterCoreEventGarbageCollector::~InterpreterCoreEventGarbageCollector() { queue_.reset(nullptr); } -void InterpreterCoreEventGarbageCollector::Add( - Garbage garbage, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx) { - if (!garbage) { - return; - } - - if (max_memory_size_ <= 1) { - Free(garbage, event, ctx); - } else { - std::unique_ptr pending_delete_garbages; - { // lock guard - std::lock_guard guard(spinlock_); - cur_memory_size_ += garbage->size(); - garbages_->push_back(std::move(garbage)); - - if (cur_memory_size_ >= max_memory_size_) { - cur_memory_size_ = 0; - 
pending_delete_garbages = std::move(garbages_); - garbages_ = std::make_unique(); - } - } - } -} - -void InterpreterCoreEventGarbageCollector::Add(Variable* var) { - PADDLE_THROW(platform::errors::Unimplemented( - "Add(Variable* var) is not implemented for " - "InterpreterCoreEventGarbageCollector.")); +void InterpreterCoreEventGarbageCollector::Add(Variable* var, + const Instruction& instr) { + PADDLE_ENFORCE_LT(instr.Id(), + gc_event_.size(), + platform::errors::OutOfRange( + "The index should be less than the size of gc event " + ", but got index is %d and size is %d", + instr.Id(), + gc_event_.size())); + Add(var, &gc_event_.at(instr.Id()), &instr.DeviceContext()); } void InterpreterCoreEventGarbageCollector::Add( @@ -109,23 +94,28 @@ void InterpreterCoreEventGarbageCollector::Add( } } -void InterpreterCoreEventGarbageCollector::Free( - GarbageQueue* garbages, +void InterpreterCoreEventGarbageCollector::Add( + Garbage garbage, platform::DeviceEvent* event, const platform::DeviceContext* ctx) { - event->Record(ctx); - event->SetFininshed(); // Only for CPU Event - queue_->AddTask([container = garbages, event = event]() { - while (!event->Query()) { -#if defined(_WIN32) - SleepEx(50, FALSE); -#else - sched_yield(); -#endif - continue; + if (!garbage) { + return; + } + + if (max_memory_size_ <= 1) { + Free(garbage, event, ctx); + } else { + { // lock guard + std::lock_guard guard(spinlock_); + cur_memory_size_ += garbage->size(); + garbages_->push_back(std::move(garbage)); + events_[ctx] = event; + + if (cur_memory_size_ >= max_memory_size_) { + FreeGarbages(); + } } - delete container; - }); + } } void InterpreterCoreEventGarbageCollector::Free( @@ -146,5 +136,28 @@ void InterpreterCoreEventGarbageCollector::Free( }); } +void InterpreterCoreEventGarbageCollector::FreeGarbages() { + for (auto& vals : events_) { + vals.second->Record(vals.first); + vals.second->SetFininshed(); // Only for CPU Event + } + queue_->AddTask( + [container = std::move(*garbages_), events = std::move(events_)]() { + for (auto& vals : events) { + while (!vals.second->Query()) { +#if defined(_WIN32) + SleepEx(50, FALSE); +#else + sched_yield(); +#endif + continue; + } + } + }); + cur_memory_size_ = 0; + garbages_->clear(); + events_.clear(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h index 415aa5a96db55..305dbb598b2cf 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h @@ -24,28 +24,31 @@ namespace framework { class InterpreterCoreEventGarbageCollector : public InterpreterCoreGarbageCollector { public: - InterpreterCoreEventGarbageCollector(); + InterpreterCoreEventGarbageCollector( + const std::vector& vec_instruction); ~InterpreterCoreEventGarbageCollector(); - - void Add(Variable* var) override; - - virtual void Add(Variable* var, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx); + void Add(Variable* var, const Instruction& instruction) override; private: + void Add(Variable* var, + platform::DeviceEvent* event, + const platform::DeviceContext* ctx); void Add(Garbage garbage, platform::DeviceEvent* event, const platform::DeviceContext* ctx); - void Free(GarbageQueue* garbages, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx); + void Free(const Garbage& garbage, 
platform::DeviceEvent* event, const platform::DeviceContext* ctx); + void FreeGarbages(); + std::unique_ptr queue_; paddle::memory::SpinLock spinlock_; + std::vector gc_event_; + std::unordered_map + events_; }; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc index 762e211bcb747..f6ed094887b59 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc @@ -17,12 +17,9 @@ namespace paddle { namespace framework { -void InterpreterCoreFastGarbageCollector::Add( - Variable* var, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx) { - PADDLE_THROW(platform::errors::Unimplemented( - "Not implemented for InterpreterCoreFastGarbageCollector.")); +void InterpreterCoreFastGarbageCollector::Add(Variable* var, + const Instruction&) { + Add(var); } void InterpreterCoreFastGarbageCollector::Add(Variable* var) { diff --git a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h index 6b5fd33f68317..07034a4f29983 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h @@ -13,8 +13,6 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" namespace paddle { @@ -23,15 +21,11 @@ namespace framework { class InterpreterCoreFastGarbageCollector : public InterpreterCoreGarbageCollector { public: - void Add(Variable* var) override; - void Add(Variable* var, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx) override; + void Add(Variable* var, const Instruction& instr) override; private: + void Add(Variable* var); void Add(Garbage garbage); }; } // namespace framework } // namespace paddle - -#endif diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc index 8e849c79bd235..e7e925a47797f 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.cc @@ -13,17 +13,48 @@ // limitations under the License. 
#include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" - #include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h" +#include "paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h" +#include "paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h" + +DECLARE_bool(fast_eager_deletion_mode); namespace paddle { namespace framework { +bool IsInterpretercoreFastGCEnabled() { + return memory::allocation::AllocatorFacade::Instance() + .IsStreamSafeCUDAAllocatorUsed() && + FLAGS_fast_eager_deletion_mode; +} + InterpreterCoreGarbageCollector::InterpreterCoreGarbageCollector() { garbages_ = std::make_unique(); max_memory_size_ = static_cast(GetEagerDeletionThreshold()); cur_memory_size_ = 0; } +std::unique_ptr +CreateInterpreterCoreGarbageCollector( + const platform::Place& place, + const std::vector& vec_instruction) { + if (platform::is_gpu_place(place)) { + if (IsInterpretercoreFastGCEnabled()) { + return std::unique_ptr( + new InterpreterCoreFastGarbageCollector()); + } else { + return std::unique_ptr( + new InterpreterCoreEventGarbageCollector(vec_instruction)); + } + } else if (platform::is_xpu_place(place) || platform::is_ipu_place(place)) { + return std::unique_ptr( + new InterpreterCoreNoEventGarbageCollector()); + } else { + return std::unique_ptr( + new InterpreterCoreEventGarbageCollector(vec_instruction)); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h index c0397ceeb6d87..2e8e1792cd139 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h @@ -15,6 +15,7 @@ #include +#include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" #include "paddle/fluid/platform/enforce.h" @@ -30,10 +31,9 @@ class InterpreterCoreGarbageCollector { public: InterpreterCoreGarbageCollector(); virtual ~InterpreterCoreGarbageCollector() {} - virtual void Add(Variable* var) = 0; - virtual void Add(Variable* var, - platform::DeviceEvent* event, - const platform::DeviceContext* ctx) = 0; + + virtual void Add(Variable* var, const Instruction& instruction) = 0; + DISABLE_COPY_AND_ASSIGN(InterpreterCoreGarbageCollector); protected: @@ -43,5 +43,12 @@ class InterpreterCoreGarbageCollector { memory::SpinLock spinlock_; }; +bool IsInterpretercoreFastGCEnabled(); + +std::unique_ptr +CreateInterpreterCoreGarbageCollector( + const platform::Place& place, + const std::vector& vec_instruction); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc new file mode 100644 index 0000000000000..bbe7659ab0cc6 --- /dev/null +++ b/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h" + +namespace paddle { +namespace framework { + +InterpreterCoreNoEventGarbageCollector:: + InterpreterCoreNoEventGarbageCollector() { + WorkQueueOptions options(/*name*/ "NoEventGarbageCollector", + /*num_threads*/ 1, + /*allow_spinning*/ true, + /*track_task*/ false); + queue_ = CreateSingleThreadedWorkQueue(options); +} + +InterpreterCoreNoEventGarbageCollector:: + ~InterpreterCoreNoEventGarbageCollector() { + queue_.reset(nullptr); +} + +void InterpreterCoreNoEventGarbageCollector::Add(Variable* var, + const Instruction& instr) { + Add(var, &instr.DeviceContext()); +} + +void InterpreterCoreNoEventGarbageCollector::Add( + Variable* var, const platform::DeviceContext* ctx) { + if (UNLIKELY(max_memory_size_ < 0) || var == nullptr) { + return; + } + + if (var->IsType()) { + Add(var->GetMutable()->MoveMemoryHolder(), ctx); + } else if (var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + // TODO(xiongkun03) in old executor, this type of variable is not support + // eager deletion. so we just leave it here ? + } else if (var->IsType()) { + // TODO(xiongkun03) in old executor, this type of variable is not support + // eager deletion. so we just leave it here ? + } else if (var->IsType()) { + Add(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder(), + ctx); + var->GetMutable()->mutable_rows()->clear(); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& t : *tensor_arr) { + Add(t.MoveMemoryHolder(), ctx); + } + } else if (var->IsType>()) { + // NOTE(@xiongkun03) conditional_op / while_op will create a STEP_SCOPE + // refer to executor.cc to see what old garbage collector does. + // do nothing, because the sub scope will be deleted by sub-executor. 
+ } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The variable(%s) is not supported in eager deletion.", + framework::ToTypeName(var->Type()))); + } +} + +void InterpreterCoreNoEventGarbageCollector::Add( + Garbage garbage, const platform::DeviceContext* ctx) { + if (!garbage) { + return; + } + if (max_memory_size_ <= 1) { + queue_->AddTask([container = garbage, ctx = ctx]() { ctx->Wait(); }); + } else { + // lock guard + std::lock_guard<memory::SpinLock> guard(spinlock_); + cur_memory_size_ += garbage->size(); + garbages_->emplace_back(std::move(garbage)); + ctxs_.insert(ctx); + + if (cur_memory_size_ >= max_memory_size_) { + cur_memory_size_ = 0; + queue_->AddTask( + [container = std::move(*garbages_), dev_ctxs = std::move(ctxs_)]() { + for (auto& ctx : dev_ctxs) { + ctx->Wait(); + } + }); + ctxs_.clear(); + garbages_->clear(); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h new file mode 100644 index 0000000000000..36c8adec367ad --- /dev/null +++ b/paddle/fluid/framework/new_executor/garbage_collector/no_event_garbage_collector.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
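Taken together with the event-based collector earlier in this patch, the no-event collector above batches garbage per DeviceContext and simply waits on those contexts from its work queue before memory is released. A minimal usage sketch of the unified Add(Variable*, const Instruction&) interface follows; it is illustrative only, and the names place, vec_instruction, and var_scope stand in for the InterpreterCore members that appear later in this series:

// Sketch: driving the unified GC interface per executed instruction.
// The factory picks the event-based, no-event, or fast collector depending
// on the place and the allocator configuration (see garbage_collector.cc).
// place, vec_instruction, and var_scope are placeholders, not patch code.
std::unique_ptr<InterpreterCoreGarbageCollector> gc =
    CreateInterpreterCoreGarbageCollector(place, vec_instruction);
for (const Instruction& instr : vec_instruction) {
  // ... run the instruction ...
  for (auto var_id : instr.GCCheckVars()) {
    // Each collector decides internally whether it needs a DeviceEvent,
    // a ctx->Wait(), or nothing at all before freeing the variable.
    gc->Add(var_scope.VarRef(var_id), instr);
  }
}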
+#pragma once + +#include + +#include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" + +namespace paddle { +namespace framework { + +class InterpreterCoreNoEventGarbageCollector + : public InterpreterCoreGarbageCollector { + public: + InterpreterCoreNoEventGarbageCollector(); + ~InterpreterCoreNoEventGarbageCollector(); + void Add(Variable* var, const Instruction& instr) override; + + private: + void Add(Variable* var, const platform::DeviceContext* ctx); + void Add(Garbage garbage, const platform::DeviceContext* ctx); + std::unique_ptr queue_; + std::unordered_set ctxs_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 1a98b44729ff8..66e8f93736a5b 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -18,8 +18,6 @@ #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" -#include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h" -#include "paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" @@ -30,6 +28,7 @@ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, @@ -41,7 +40,6 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); -DECLARE_bool(fast_eager_deletion_mode); constexpr const char* kExceptionCaught = "ExceptionCaught"; constexpr const char* kTaskCompletion = "TaskCompletion"; @@ -52,12 +50,6 @@ namespace framework { static constexpr size_t kHostNumThreads = 4; static constexpr size_t kDeviceNumThreads = 1; -bool IsInterpretercoreFastGCEnabled() { - return memory::allocation::AllocatorFacade::Instance() - .IsStreamSafeCUDAAllocatorUsed() && - FLAGS_fast_eager_deletion_mode; -} - InterpreterCore::InterpreterCore(const platform::Place& place, const BlockDesc& block, const std::set& skip_gc_vars, @@ -71,16 +63,6 @@ InterpreterCore::InterpreterCore(const platform::Place& place, is_build_ = false; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (IsInterpretercoreFastGCEnabled()) { - gc_ = std::make_unique(); - } else { - gc_ = std::make_unique(); - } -#else - gc_ = std::make_unique(); -#endif - exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); @@ -117,6 +99,11 @@ InterpreterCore::~InterpreterCore() { interpreter::CostInfo InterpreterCore::DryRun( const std::vector& feed_names, const std::vector& feed_tensors) { +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(place_.device); + } +#endif Prepare(feed_names, feed_tensors, true); interpreter::CostInfo cost_info; { @@ -141,6 +128,11 @@ interpreter::CostInfo InterpreterCore::DryRun( paddle::framework::FetchList InterpreterCore::Run( const std::vector& feed_names, const std::vector& feed_tensors) { +#if defined(PADDLE_WITH_CUDA) && 
defined(PADDLE_WITH_HETERPS) + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(place_.device); + } +#endif #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); #endif @@ -172,6 +164,11 @@ paddle::framework::FetchList InterpreterCore::Run( paddle::framework::FetchList InterpreterCore::Run( const std::vector& feed_names) { +#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) + if (platform::is_gpu_place(place_)) { + platform::SetDeviceId(place_.device); + } +#endif #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); #endif @@ -498,16 +495,7 @@ void InterpreterCore::Convert( } BuildSkipShareLoDInfo(); - - for (size_t i = 0; i < vec_instruction_.size(); ++i) { -#ifdef PADDLE_WITH_IPU - gc_event_.emplace_back(phi::CPUPlace(), 0); -#else - gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(), - platform::GenerateDeviceEventFlag()); - -#endif - } + gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); bool inplaced = false; for (auto inst : vec_instruction_) { if (inst.OpBase()->Type() == "share_buffer" || @@ -828,9 +816,6 @@ void InterpreterCore::RunInstructionAsync( RunInstruction(instr_node); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - RecordStreamForGC(instr_node); -#endif CheckGC(instr_node, atomic_var_ref); interpreter::RecordEvent(instr_node, place_); @@ -969,7 +954,9 @@ void InterpreterCore::CheckGC( std::vector>* atomic_var_ref) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); - size_t instr_id = instr.Id(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + RecordStreamForGC(instr); +#endif auto& var_scope = var_scope_; for (auto var_id : instr.GCCheckVars()) { @@ -986,23 +973,7 @@ void InterpreterCore::CheckGC( if (is_ready) { VLOG(6) << "Async delete variable with name : " << var_scope.GetNameById(var_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (IsInterpretercoreFastGCEnabled()) { - static_cast(gc_.get())->Add( - var_scope_.VarRef(var_id)); - - } else { - static_cast(gc_.get())->Add( - var_scope_.VarRef(var_id), - &gc_event_.at(instr_id), - &instr.DeviceContext()); - } -#else - static_cast(gc_.get())->Add( - var_scope_.VarRef(var_id), - &gc_event_.at(instr_id), - &instr.DeviceContext()); -#endif + gc_->Add(var_scope_.VarRef(var_id), instr); } } } diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 7069be5af160f..a7efa1349e8f1 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -141,7 +141,6 @@ class InterpreterCore { std::shared_ptr completion_notifier_{nullptr}; std::unique_ptr gc_; - std::vector gc_event_; std::future> atomic_deps_; std::future> atomic_var_ref_; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 52ac86d060694..204b2c87544de 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -425,6 +425,9 @@ OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) { CopyFrom(other); block_ = block; need_update_ = true; + for (auto &iter : attrs_) { + UpdateVarAttr(iter.first, iter.second); + } } void OpDesc::CopyFrom(const OpDesc &op_desc) { @@ -465,9 +468,13 @@ OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block) for (const proto::OpDesc::Attr &attr : desc_.attrs()) { std::string attr_name = attr.name(); // The sub_block referred to by the BLOCK attr 
hasn't been added - // to ProgramDesc class yet, we skip setting BLOCK/BLOCKS attr here. - if (attr.type() != proto::AttrType::BLOCK && - attr.type() != proto::AttrType::BLOCKS) { + // to ProgramDesc class yet, we skip setting BLOCK/BLOCKS/VAR/VARS attr + // here. + auto attr_type = attr.type(); + if (attr_type != proto::AttrType::BLOCK && + attr_type != proto::AttrType::BLOCKS && + attr_type != proto::AttrType::VAR && + attr_type != proto::AttrType::VARS) { attrs_[attr_name] = GetAttrValue(attr); } } @@ -489,9 +496,31 @@ const std::vector &OpDesc::Input(const std::string &name) const { return it->second; } -std::vector OpDesc::InputArgumentNames() const { +std::vector OpDesc::Input(const std::string &name, + bool with_attr_var) const { + // Attribute with VarDesc type will consider as Input + if (with_attr_var) { + auto it = attrs_.find(name); + if (it != attrs_.end() && HasAttrVar(it->second)) + return AttrVarNames(it->second); + } + return this->Input(name); +} + +VariableNameMap OpDesc::Inputs(bool with_attr_var) const { + if (!with_attr_var) { + return inputs_; + } + VariableNameMap res = inputs_; + for (auto &attr : FilterAttrVar(attrs_)) { + res[attr.first] = AttrVarNames(attr.second); + } + return res; +} + +std::vector OpDesc::InputArgumentNames(bool with_attr_var) const { std::vector retv; - for (auto &ipt : this->inputs_) { + for (auto &ipt : this->Inputs(with_attr_var)) { retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); } return retv; @@ -558,24 +587,31 @@ bool OpDesc::HasProtoAttr(const std::string &name) const { return false; } -proto::AttrType OpDesc::GetAttrType(const std::string &name) const { - auto it = attrs_.find(name); - PADDLE_ENFORCE_NE( - it, - attrs_.end(), - platform::errors::NotFound("Attribute %s is not found.", name)); - return static_cast(it->second.index() - 1); +proto::AttrType OpDesc::GetAttrType(const std::string &name, + bool with_attr_var) const { + auto attr = this->GetAttr(name, with_attr_var); + return static_cast(attr.index() - 1); } -std::vector OpDesc::AttrNames() const { +std::vector OpDesc::AttrNames(bool with_attr_var) const { std::vector retv; retv.reserve(attrs_.size()); for (auto &attr : attrs_) { + if (!with_attr_var && HasAttrVar(attr.second)) continue; retv.push_back(attr.first); } return retv; } +bool OpDesc::HasAttr(const std::string &name, bool with_attr_var) const { + auto iter = attrs_.find(name); + bool is_found = iter != attrs_.end(); + if (with_attr_var) { + return is_found; + } + return is_found && !HasAttrVar(iter->second); +} + void OpDesc::RemoveAttr(const std::string &name) { attrs_.erase(name); need_update_ = true; @@ -647,6 +683,16 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { need_update_ = true; } +void OpDesc::SetVarAttr(const std::string &name, VarDesc *var) { + this->attrs_[name] = var; + need_update_ = true; +} + +void OpDesc::SetVarsAttr(const std::string &name, std::vector vars) { + this->attrs_[name] = vars; + need_update_ = true; +} + void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) { this->attrs_[name] = block; need_update_ = true; @@ -664,12 +710,18 @@ void OpDesc::SetAttrMap( need_update_ = true; } -Attribute OpDesc::GetAttr(const std::string &name) const { +Attribute OpDesc::GetAttr(const std::string &name, bool with_attr_var) const { auto it = attrs_.find(name); PADDLE_ENFORCE_NE( it, attrs_.end(), platform::errors::NotFound("Attribute %s is not found.", name)); + if (!with_attr_var) { + PADDLE_ENFORCE_EQ( + HasAttrVar(it->second), + false, + 
platform::errors::NotFound("Attribute %s is not found.", name)); + } return it->second; } @@ -790,6 +842,19 @@ struct SetAttrDescVisitor { void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_bools()); } + + void operator()(const std::vector &v) const { + std::vector var_names; + for (auto var : v) { + var_names.emplace_back(var->Name()); + } + VectorToRepeated(var_names, attr_->mutable_vars_name()); + } + + void operator()(const VarDesc *desc) const { + attr_->set_var_name(desc->Name()); + } + void operator()(const std::vector &v) const { std::vector blocks_idx; for (auto blk : v) { @@ -818,6 +883,8 @@ struct SetAttrDescVisitor { }; void OpDesc::Flush() { + VLOG(4) << "Flush " + << " " << Type() << " " << need_update_; if (need_update_) { this->desc_.mutable_inputs()->Clear(); for (auto &ipt : inputs_) { @@ -866,12 +933,7 @@ void OpDesc::InferShape(const BlockDesc &block) { try { VLOG(3) << "CompileTime infer shape on " << Type(); auto &op_info = OpInfoMap::Instance().Get(this->Type()); - auto *checker = op_info.Checker(); - if (checker != nullptr) { - // set dafault value here - VLOG(10) << "begin to check attribute of " << Type(); - checker->Check(&attrs_); - } + this->CheckAttrs(); auto &infer_shape = op_info.infer_shape_; PADDLE_ENFORCE_EQ( static_cast(infer_shape), @@ -916,15 +978,62 @@ void OpDesc::InferVarType(BlockDesc *block) const { } } +void OpDesc::UpdateVarAttr(const std::string &name, const Attribute &attr) { + auto attr_type = static_cast(attr.index() - 1); + auto type = GetAttrType(name, true); + if (type == proto::AttrType::VAR) { + PADDLE_ENFORCE_EQ( + attr_type, + type, + platform::errors::InvalidArgument( + "Required attr.type == proto::AttrType::VAR, but received %s", + attr_type)); + auto *var_desc = PADDLE_GET_CONST(VarDesc *, attr); + VLOG(3) << "Update AttrVar " << name << " with " << var_desc->Name(); + attrs_[name] = FindVarRecursive(var_desc->Name()); + } else if (type == proto::AttrType::VARS) { + PADDLE_ENFORCE_EQ( + attr_type, + type, + platform::errors::InvalidArgument( + "Required attr.type == proto::AttrType::VARS, but received %s", + attr_type)); + auto vars_desc = PADDLE_GET_CONST(std::vector, attr); + std::vector new_val; + for (auto &var_desc : vars_desc) { + VLOG(3) << "Update AttrVars " << name << " with " << var_desc->Name(); + new_val.emplace_back(FindVarRecursive(var_desc->Name())); + } + attrs_[name] = std::move(new_val); + } +} + +VarDesc *OpDesc::FindVarRecursive(const std::string &name) { + auto *cur_block = block_; + while (cur_block != nullptr && cur_block->ID() >= 0) { + auto *var = block_->FindVar(name); + if (var != nullptr) { + return var; + } + cur_block = cur_block->ParentBlock(); + } + PADDLE_THROW(platform::errors::NotFound( + "Not found Var(%s) from Block(%d) back into global Block.", + name, + block_->ID())); +} + CompileTimeInferShapeContext::CompileTimeInferShapeContext( const OpDesc &op, const BlockDesc &block) : op_(op), block_(block) {} bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { - if (op_.Inputs().find(name) == op_.Inputs().end()) { + auto inputs = op_.Inputs(/*with_attr_var=*/true); + if (inputs.find(name) == inputs.end()) { return false; } - const std::vector &input_names = op_.Input(name); + const std::vector &input_names = + op_.Input(name, /*with_attr_var=*/true); auto length = input_names.size(); if (length == 0) { return false; @@ -959,14 +1068,16 @@ bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const { } bool 
CompileTimeInferShapeContext::HasAttr(const std::string &name) const { - return op_.HasAttr(name); + return op_.HasAttr(name, /*with_attr_var=*/false); } bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const { - if (op_.Inputs().find(name) == op_.Inputs().end()) { + auto inputs = op_.Inputs(/*with_attr_var=*/true); + if (inputs.find(name) == inputs.end()) { return false; } - const std::vector &input_names = op_.Input(name); + const std::vector &input_names = + op_.Input(name, /*with_attr_var=*/true); if (input_names.empty()) { return false; } @@ -1004,7 +1115,7 @@ AttrReader CompileTimeInferShapeContext::Attrs() const { std::vector CompileTimeInferShapeContext::Inputs( const std::string &name) const { - return op_.Input(name); + return op_.Input(name, /*with_attr_var=*/true); } std::vector CompileTimeInferShapeContext::Outputs( @@ -1054,5 +1165,21 @@ proto::VarType::Type CompileTimeInferShapeContext::GetVarType( return block_.FindVarRecursive(name)->GetType(); } +std::vector AttrVarNames(const Attribute &attr) { + std::vector vars_name; + if (IsAttrVar(attr)) { + vars_name.emplace_back(PADDLE_GET_CONST(VarDesc *, attr)->Name()); + } else if (IsAttrVars(attr)) { + for (auto &iter : PADDLE_GET_CONST(std::vector, attr)) { + vars_name.emplace_back(iter->Name()); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported Attribute value type `%s` for AttrVarNames", + platform::demangle(attr.type().name()))); + } + return vars_name; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 02186a02e3d83..a1f264a849dcd 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -27,6 +27,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +class VarDesc; class BlockDesc; class ProgramDesc; @@ -55,7 +56,10 @@ class OpDesc { const std::vector &Input(const std::string &name) const; - std::vector InputArgumentNames() const; + std::vector Input(const std::string &name, + bool with_attr_var) const; + + std::vector InputArgumentNames(bool with_attr_var = false) const; void SetInput(const std::string ¶m_name, const std::vector &args); @@ -72,24 +76,27 @@ class OpDesc { void RemoveInput(const std::string &name); - bool HasAttr(const std::string &name) const { - return attrs_.find(name) != attrs_.end(); - } + bool HasAttr(const std::string &name, bool with_attr_var = false) const; bool HasProtoAttr(const std::string &name) const; - proto::AttrType GetAttrType(const std::string &name) const; + proto::AttrType GetAttrType(const std::string &name, + bool with_attr_var = false) const; - std::vector AttrNames() const; + std::vector AttrNames(bool with_attr_var = false) const; void SetAttr(const std::string &name, const Attribute &v); void RemoveAttr(const std::string &name); + void SetVarAttr(const std::string &name, VarDesc *var); + + void SetVarsAttr(const std::string &name, std::vector vars); + void SetBlockAttr(const std::string &name, BlockDesc *block); void SetBlocksAttr(const std::string &name, std::vector blocks); - Attribute GetAttr(const std::string &name) const; + Attribute GetAttr(const std::string &name, bool with_attr_var = false) const; template T GetAttrIfExists(const std::string &name) const { @@ -120,11 +127,15 @@ class OpDesc { // Only be used in C++ void SetAttrMap(const AttributeMap &attr_map); - std::vector InputNames() const { return MapKeys(inputs_); } + std::vector InputNames(bool with_attr_var = false) const { + return MapKeys(inputs_); + } std::vector OutputNames() const { return MapKeys(outputs_); } const VariableNameMap &Inputs() const { return inputs_; } + VariableNameMap Inputs(bool with_attr_var) const; + const VariableNameMap &Outputs() const { return outputs_; } VariableNameMap *MutableInputs() { @@ -156,12 +167,20 @@ class OpDesc { const BlockDesc *Block() const { return this->block_; } + void UpdateVarAttr(const std::string &name, const Attribute &attr); + // The Id() and OrignalId() are only used for auto parallel. uint64_t Id() const { return id_; } uint64_t OriginalId() const { return original_id_; } void SetOriginalId(uint64_t original_id) { original_id_ = original_id; } + bool NeedUpdate() const { return need_update_; } + private: + friend class ProgramDesc; + // Find VarDesc from OpDesc located Block into global Block + VarDesc *FindVarRecursive(const std::string &name); + template static std::vector MapKeys(const MapType &map) { std::vector ret_val; @@ -181,13 +200,14 @@ class OpDesc { // Must start from one return ++uid; } - + // it it really needed? or just mantain a ptr from block? proto::OpDesc desc_; BlockDesc *block_{nullptr}; // not_own // input arg name => input variable names VariableNameMap inputs_; // output arg name => output variable names VariableNameMap outputs_; + // attribute name => all original attrs AttributeMap attrs_; // need_update_ indicate there some local changes not be synchronized. If @@ -202,5 +222,7 @@ class OpDesc { // current OpDesc is not built from the other one. 
uint64_t original_id_ = id_; }; + +std::vector AttrVarNames(const Attribute &attr); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 51aeed2e5d734..9cea78c92c6a0 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -96,12 +96,10 @@ class OpProtoAndCheckerMaker { template TypedAttrChecker &AddAttr(const std::string &name, - const std::string &comment, - bool generated = false) { + const std::string &comment) { auto *attr = proto_->add_attrs(); attr->set_name(name); attr->set_comment(comment); - attr->set_generated(generated); attr->set_type(AttrTypeID()); return op_checker_->AddAttrChecker(name, attr); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6c85cee0b049d..c2a665126767c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" +#include "paddle/fluid/framework/inference_cached_ops.h" #include "paddle/fluid/framework/op_call_stack.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/shape_inference.h" @@ -447,6 +448,13 @@ OperatorBase::OperatorBase(const std::string& type, GenerateTemporaryNames(); CheckAllInputOutputSet(); } + // In OperatorBase level, all attribute with VarDesc type will be considered + // as Input. + for (auto& attr : FilterAttrVar(attrs)) { + VLOG(3) << "found Attribute with Variable type: " << attr.first; + inputs_[attr.first] = std::move(AttrVarNames(attr.second)); + attrs_.erase(attr.first); + } } std::vector OperatorBase::InputVars() const { @@ -702,6 +710,12 @@ class RuntimeInferShapeContext : public InferShapeContext { return in[0] != nullptr; } + size_t InputsSize() const { + auto& op_proto = + paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; + return op_proto->inputs().size(); + } + bool HasOutput(const std::string& name) const override { // has only one output const auto& outs = ctx_.outputs; @@ -1193,7 +1207,86 @@ struct OperatorWithKernel::CacheImpl { return infer_shape_ctx_.get(); } + bool updateInputsShapesDimCache() { + bool flag = false; + size_t inputs_size = + std::min(kernel_ctx_->InputsSize(), infer_shape_ctx_->InputsSize()); + for (size_t i = 0; i < inputs_size; i++) { + const std::string& in_name = infer_shape_ctx_->GetInputNameByIdx(i); + if (!infer_shape_ctx_->HasInputs(in_name)) continue; + if (!inputs_dim_caches.count(in_name) || + infer_shape_ctx_->GetInputsDim(in_name) != + inputs_dim_caches[in_name]) { + inputs_dim_caches[in_name] = infer_shape_ctx_->GetInputsDim(in_name); + flag = true; + } + } + +#if defined(PADDLE_WITH_CUDA) + if (flag) discardCudaGraphCache(); +#endif + return flag; + } + + bool cudaGraphEnabled(bool need_prepare_data, + bool need_prepare_phi_data, + const std::string& op_type) const { +#if defined(PADDLE_WITH_CUDA) + return std::count(cached_gpu_ops.begin(), cached_gpu_ops.end(), op_type) && + !need_prepare_data && !need_prepare_phi_data; +#else + return false; +#endif + } + + bool cacheEnabled(bool run_phi_kernel, + bool need_prepare_data, + bool need_prepare_phi_data, + const std::string& op_type) const { +#if defined(PADDLE_WITH_CUDA) + if (cudaGraphEnabled(need_prepare_data, need_prepare_phi_data, op_type)) + return true; +#endif + 
return (run_phi_kernel && !need_prepare_data && !need_prepare_phi_data); + } + +#if defined(PADDLE_WITH_CUDA) + void startCudaGraphCapture() { + phi::GPUContext* ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + auto stream = ctx->stream(); + cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + } + + void endCudaGraphCapture() { + phi::GPUContext* ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + auto stream = ctx->stream(); + + cudaGraph_t graph_; + cudaStreamEndCapture(stream, &graph_); + cudaGraphInstantiate(&graph_instance_, graph_, NULL, NULL, 0); + graph_generated = true; + } + + void runCudaGraph() { + phi::GPUContext* ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + auto stream = ctx->stream(); + cudaGraphLaunch(graph_instance_, stream); + } + + bool cudaGraphGenerated() { return graph_generated; } + + void discardCudaGraphCache() { graph_generated = false; } + + private: + bool graph_generated{false}; + cudaGraphExec_t graph_instance_; +#endif + private: + std::map> inputs_dim_caches; std::unique_ptr kernel_ctx_; std::unique_ptr infer_shape_ctx_; }; @@ -1403,8 +1496,74 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, this->Info().infer_shape_(&infer_shape_ctx); } +void OperatorWithKernel::InitOpCache(const Scope& scope, + const platform::Place& place) const { + if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) { + std::lock_guard lock(cache_update_mutex_); + if (runtime_ctx_.get() == nullptr || pre_scope_ != &scope) { + runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); + pre_scope_ = &scope; + } + } + + impl_ = + new CacheImpl(new phi::KernelContext(), + new RuntimeInferShapeContext(*this, *runtime_ctx_.get())); + + RunImpl(scope, place, runtime_ctx_.get()); + if (impl_->cacheEnabled(run_phi_kernel_, + need_prepare_data_, + need_prepare_phi_data_, + Type())) { + impl_->updateInputsShapesDimCache(); + } +} + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { + // function name: runOpCache() + // effect: reuse cacheImpl to accelerate inference period + auto runOpCache = [&]() { +#if defined(PADDLE_WITH_CUDA) + if (impl_->cudaGraphEnabled( + need_prepare_data_, need_prepare_phi_data_, Type())) { + // cudaGraph cache + if (impl_->updateInputsShapesDimCache()) { + if (!all_kernels_must_compute_runtime_shape_) + this->Info().infer_shape_(impl_->getRuntimeInferShapeContext()); + (*phi_kernel_)(impl_->getKernelContext()); + } else if (!impl_->cudaGraphGenerated()) { + impl_->startCudaGraphCapture(); + impl_->getKernelContext(); + RunImpl(scope, place, runtime_ctx_.get()); + impl_->endCudaGraphCapture(); + } else { + if (!all_kernels_must_compute_runtime_shape_) + this->Info().infer_shape_(impl_->getRuntimeInferShapeContext()); + impl_->runCudaGraph(); + } + return; + } +#endif + // common cache + if (!all_kernels_must_compute_runtime_shape_) + this->Info().infer_shape_(impl_->getRuntimeInferShapeContext()); + (*phi_kernel_)(impl_->getKernelContext()); + }; + + // function name: updateRuntimeContext + // effect: update runtime_ctx from current scope. 
+ auto updateRuntimeContext = [&](const Scope& scope) { + const Scope* cur_scope = &scope; + if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) { + std::lock_guard lock(cache_update_mutex_); + if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) { + runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); + pre_scope_ = cur_scope; + } + } + }; + // To reduce the elapsed time of HasAttr, we use bool variable to record the // result of HasAttr. if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext)) @@ -1417,20 +1576,18 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeContext ctx(Inputs(), Outputs(), scope); RunImpl(scope, place, &ctx); pre_scope_ = cur_scope; - } else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ && - !need_prepare_phi_data_) { - if (!all_kernels_must_compute_runtime_shape_) - this->Info().infer_shape_(impl_->getRuntimeInferShapeContext()); - (*phi_kernel_)(impl_->getKernelContext()); } else { - if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) { - std::lock_guard lock(cache_update_mutex_); - if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) { - runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); - pre_scope_ = cur_scope; - } + if (!impl_) { + InitOpCache(scope, place); + } else if (impl_->cacheEnabled(run_phi_kernel_, + need_prepare_data_, + need_prepare_phi_data_, + Type())) { + runOpCache(); + } else { + updateRuntimeContext(scope); + RunImpl(scope, place, runtime_ctx_.get()); } - RunImpl(scope, place, runtime_ctx_.get()); } } @@ -1695,9 +1852,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, phi::KernelContext phi_kernel_context; if (enable_cache_runtime_context_ && !need_prepare_phi_data_ && !need_prepare_data_) { - impl_ = - new CacheImpl(new phi::KernelContext(), - new RuntimeInferShapeContext(*this, *runtime_ctx)); BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext()); (*phi_kernel_)(impl_->getKernelContext()); } else { @@ -2200,7 +2354,9 @@ Scope* OperatorWithKernel::PrepareData( (in_def->backend != phi::Backend::GPUDNN || tensor_backend != phi::Backend::GPU) && (in_def->backend != phi::Backend::KPS || - tensor_backend != phi::Backend::XPU)) || + tensor_backend != phi::Backend::XPU) && + (in_def->backend != phi::Backend::ONEDNN || + tensor_backend != phi::Backend::CPU)) || tensor_in->place().GetType() == AllocationType::GPUPINNED) { new_expected_kernel_key = std::make_unique( expected_kernel_key.data_type_, @@ -2723,6 +2879,8 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t i = 0; i < attr_names.size(); ++i) { VLOG(6) << "BuildPhiKernelContext: " << attr_names[i] << ": " << attr_defs[i].type_index; + // attribute with Variable type has been placed into Inputs(), and + // we can parse them from RuntimeContext.inputs. 
auto attr_iter = Attrs().find(attr_names[i]); switch (attr_defs[i].type_index) { case phi::AttributeType::SCALAR: diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 17ec9a1f93e72..4185b450c7a8c 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -712,6 +712,7 @@ class OperatorWithKernel : public OperatorBase { // used for IndicateOrPromoteVarDataTypes Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx, const std::string& name) const; + void InitOpCache(const Scope& scope, const platform::Place& place) const; protected: mutable std::unique_ptr kernel_type_; diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index 593646164940b..f0f35ea28cc53 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -62,6 +62,8 @@ const std::unordered_map> kDenyParamMap = {{"batch_norm", {"ReserveSpace"}}, {"batch_norm_grad", {"ReserveSpace"}}}; +const std::unordered_set kDefaultDenyOps = {"feed", "fetch"}; + std::unordered_set GetDenyVarNames(const GraphNodeSet& cluster) { std::unordered_set deny_var_set; @@ -560,22 +562,24 @@ void SearchAllSubgraphs(Graph* graph) { auto allow_ops = StringSplit(FLAGS_allow_cinn_ops, kDelim); auto deny_ops = StringSplit(FLAGS_deny_cinn_ops, kDelim); auto teller = [&allow_ops, &deny_ops](const Node* node) { + const auto& node_name = node->Name(); bool registered = ::cinn::frontend::OpMapperRegistry::Global()->Find( - node->Name()) != nullptr; + node_name) != nullptr; // if the op type is registered in CINN and allow_ops is not empty, return // true only when it is in allow_ops - if (allow_ops.size()) { - return registered && allow_ops.count(node->Name()); + if (!allow_ops.empty()) { + return registered && allow_ops.count(node_name); } // if the op type is registered in CINN and deny_ops is not empty, return // true only when it is not in deny_ops - if (deny_ops.size()) { - return registered && !deny_ops.count(node->Name()); + if (!deny_ops.empty()) { + return registered && !deny_ops.count(node_name); } // if the user doesn't set FLAGS_allow_cinn_ops and FLAGS_deny_cinn_ops, // return true only when it is registered in CINN - return registered && (node->IsOp() && !IsInplaceOp(*node->Op())); + return registered && !kDefaultDenyOps.count(node_name) && + (node->IsOp() && !IsInplaceOp(*node->Op())); }; VLOG(4) << "The allowed Cinn Ops: " << FLAGS_allow_cinn_ops; VLOG(4) << "The denied Cinn Ops: " << FLAGS_deny_cinn_ops; diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index b184bc8be3681..1788119490b31 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/program_desc.h" +#include #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/version.h" @@ -97,6 +98,23 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { block_descs.push_back(MutableBlock(block_id)); } op->SetBlocksAttr(attr_name, block_descs); + } else if (op->GetAttrType(attr_name, true) == proto::AttrType::VAR) { + VarDesc *var_desc = + PADDLE_GET_CONST(VarDesc *, op->GetAttr(attr_name, true)); + op->SetVarAttr(attr_name, + o.Block(block_id).FindVarRecursive(var_desc->Name())); + } else if (op->GetAttrType(attr_name, true) == proto::AttrType::VARS) { + std::vector vars_desc = PADDLE_GET_CONST( + std::vector, op->GetAttr(attr_name, true)); + std::vector new_vars_desc; + std::transform( + vars_desc.begin(), + vars_desc.end(), + std::back_inserter(new_vars_desc), + [&](VarDesc *var_desc) { + return o.Block(block_id).FindVarRecursive(var_desc->Name()); + }); + op->SetVarsAttr(attr_name, new_vars_desc); } } } @@ -129,7 +147,21 @@ void ProgramDesc::InitFromProto() { for (auto &block : blocks_) { for (auto *op : block->AllOps()) { for (const auto &attr : op->Proto()->attrs()) { - if (attr.type() == proto::AttrType::BLOCK) { + if (attr.type() == proto::AttrType::VAR) { + std::string var_name = attr.var_name(); + VLOG(3) << "InitFromProto: SetVarAttr " << attr.name() << " from " + << var_name; + op->SetVarAttr(attr.name(), op->FindVarRecursive(var_name)); + } else if (attr.type() == proto::AttrType::VARS) { + auto vars_name = attr.vars_name(); + std::vector vars_desc; + for (auto &var_name : vars_name) { + VLOG(3) << "InitFromProto: SetVarsAttr " << attr.name() << " from " + << var_name; + vars_desc.emplace_back(op->FindVarRecursive(var_name)); + } + op->SetVarsAttr(attr.name(), vars_desc); + } else if (attr.type() == proto::AttrType::BLOCK) { size_t blk_idx = attr.block_idx(); op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); } else if (attr.type() == proto::AttrType::BLOCKS) { @@ -217,5 +249,16 @@ void ProgramDesc::SetFetchHolderName(const std::string &fetch_holder_name) { fetch_holder->SetPersistable(true); } +bool ProgramDesc::NeedUpdate() const { + bool need = false; + for (auto &block : blocks_) { + if (block->NeedUpdate()) { + need = true; + break; + } + } + return need; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 7e1c12f4ac5b1..86d347caf58df 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -85,6 +85,8 @@ class ProgramDesc { // This function is used to change or unify the fetch_holder variables' name. void SetFetchHolderName(const std::string &fetch_holder_name); + bool NeedUpdate() const; + private: void InitFromProto(); diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index ede6a99c43678..ceb45a83711ea 100644 --- a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -180,6 +180,9 @@ void prune_impl(const proto::ProgramDesc& input, std::map* pruned_origin_block_id_map) { auto& block = input.blocks(block_id); auto& ops = block.ops(); + auto add_dependent_var = [&](const std::string& name) { + if (feed_var_names.count(name) == 0) dependent_vars->insert(name); + }; bool expect_feed = true; for (auto& op_desc : ops) { @@ -245,8 +248,17 @@ void prune_impl(const proto::ProgramDesc& input, // For eval / infer mode, there is no optimize op in program. 
for (auto& var : op_desc.inputs()) { for (auto& argu : var.arguments()) { - if (feed_var_names.count(argu) == 0) { - dependent_vars->insert(argu); + add_dependent_var(argu); + } + } + // NOTE(dev): All attibute with VarDesc type is considered as Input, + // so they shall be added into dependent_vars. + for (auto& attr : op_desc.attrs()) { + if (attr.type() == proto::AttrType::VAR) { + add_dependent_var(attr.var_name()); + } else if (attr.type() == proto::AttrType::VARS) { + for (auto& name : attr.vars_name()) { + add_dependent_var(name); } } } @@ -331,20 +343,30 @@ void prune_impl(const proto::ProgramDesc& input, } std::set var_names; + auto add_var_names = [&](const std::string& name) { + if (var_map.count(name) != 0) var_names.insert(name); + }; for (const auto& op : *op_field) { auto& input_field = op.inputs(); for (auto& input_var : input_field) { for (auto& arg : input_var.arguments()) { - if (var_map.count(arg) != 0) { - var_names.insert(arg); - } + add_var_names(arg); } } auto& output_field = op.outputs(); for (auto& output_var : output_field) { for (auto& arg : output_var.arguments()) { - if (var_map.count(arg) != 0) { - var_names.insert(arg); + add_var_names(arg); + } + } + // NOTE(dev): All attibute with VarDesc type is considered as Input, + // so they shall be added into dependent_vars. + for (auto& attr : op.attrs()) { + if (attr.type() == proto::AttrType::VAR) { + add_var_names(attr.var_name()); + } else if (attr.type() == proto::AttrType::VARS) { + for (auto& name : attr.vars_name()) { + add_var_names(name); } } } diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 8b79d4079118f..fcb061aa93288 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -14,63 +14,15 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" namespace paddle { - namespace framework { using LoD = std::vector>; - -/* - NOTE(liym27): [ What is TensorInplaceVersion used for? ] - - TensorInplaceVersion is a version counter and every Tensor has a version - counter. It's used to check whether an inplace operation will result in an - incorrect gradient calculation. Version is incremented when the data of the - Variable is modified in place. - - - Question: In what scenarios will version counters be shared? - - Answer: When two Variables/VarBases share the same C++ Tensor(its Allocation - may change), both of them share the same version counter. For examples: - 1. `z = paddle.assign(input=x, output=y)`, `z` shares the same version counter - of `y` because z and y is the same VarBase; - 2. `y = x.detach()`, `y` shares the same version counter of `x`. - - - Question: In what scenarios will version counters NOT be shared? - - Answer: Replacing a `Variable`'s data by calling `Tensor::ShareDataWith(...)` - or `Tensor::ShareBufferWith(...)`. Because they share the same Allocation but - not framework::Tensor. - - - Question: Why put the inplace_version_counter_ in framework::Tensor instead - of Allocation or Variable? - - Answer: - 1. 
Tensor can call ResetHolder() to reset the corresponding Allocation so that - the inplace_version_counter_ changes if it's in Allocation, which will lead to - confusing information about inplace version. - 2. If inplace_version_counter_ is in Variable, different VariableWrappers - should be able to share the same Variable. However, a VariableWrapper hold a - Variable object but not a pointer. -*/ - using Tensor = phi::DenseTensor; } // namespace framework } // namespace paddle - -#include "paddle/fluid/framework/tensor_impl.h" diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 05dd41eb6ffc5..fcf255dafc2e0 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" #include diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index b1bba0f7c35f8..3c9d1284cefdb 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/mlu/device_context.h" #endif +#include "paddle/fluid/memory/memory.h" #include "paddle/phi/core/dense_tensor.h" namespace paddle { @@ -580,6 +581,26 @@ inline void TensorToVector(const Tensor& src, std::vector* dst) { std::ostream& operator<<(std::ostream& os, const LoD& lod); +inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { + int rank = src.dims().size(); + PADDLE_ENFORCE_GE( + rank, + 2, + platform::errors::InvalidArgument( + "'ReshapeToMatrix()' is only used for flatten high rank " + "tensors to matrixs. The dimensions of Tensor must be " + "greater or equal than 2. " + "But received dimensions of Tensor is %d", + rank)); + if (rank == 2) { + return src; + } + Tensor res; + res.ShareDataWith(src); + res.Resize(phi::flatten_to_2d(src.dims(), num_col_dims)); + return res; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 3bcad63f21a84..31a006914aca7 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -34,6 +34,7 @@ class OperatorBase; class OpDesc; class InferShapeContext; class InferVarTypeContext; +class VarDesc; class BlockDesc; class Variable; class InferNoNeedBufferVarsFN; @@ -55,7 +56,9 @@ using Attribute = paddle::variant, std::vector, - std::vector>; + std::vector, + VarDesc*, + std::vector>; using AttributeMap = std::unordered_map; #ifdef PADDLE_WITH_ASCEND_CL @@ -73,6 +76,8 @@ using NPUAttribute = paddle::variant, std::vector, std::vector, + VarDesc*, + std::vector, std::vector>>; using NPUAttributeMap = std::unordered_map; diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index 624b297f67199..1072657b4afc3 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -65,9 +65,12 @@ class VarDesc { desc_.set_name(name); // TODO(paddle-dev): Why default to lodtensor. 
desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR); + need_updated_ = true; } - explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {} + explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) { + // need_updated_ = true; + } // Explicitly implement the copy constructor for auto parallel VarDesc(const VarDesc &other) @@ -78,16 +81,23 @@ class VarDesc { desc_ = other.desc_; attrs_ = other.attrs_; original_id_ = other.original_id_; + need_updated_ = true; return *this; } - proto::VarDesc *Proto() { return &desc_; } + proto::VarDesc *Proto() { + // mark as dirty before handing out the mutable proto pointer + need_updated_ = true; + return &desc_; + } const proto::VarDesc *Proto() const { return &desc_; } std::string Name() const { return desc_.name(); } - void SetName(std::string name) { desc_.set_name(name); } + void SetName(std::string name) { + desc_.set_name(name); + need_updated_ = true; + } void SetTensorDescNum(size_t num); @@ -126,15 +136,22 @@ class VarDesc { bool Persistable() const { return desc_.persistable(); } - void SetPersistable(bool persistable) { desc_.set_persistable(persistable); } + void SetPersistable(bool persistable) { + desc_.set_persistable(persistable); + need_updated_ = true; + } bool IsParameter() const { return desc_.is_parameter(); } void SetIsParameter(bool is_parameter) { desc_.set_is_parameter(is_parameter); + need_updated_ = true; } - void ClearIsParameter() { desc_.clear_is_parameter(); } + void ClearIsParameter() { + desc_.clear_is_parameter(); + need_updated_ = true; + } bool HasIsParameter() const { return desc_.has_is_parameter(); } @@ -142,9 +159,13 @@ class VarDesc { void SetStopGradient(bool stop_gradient) { desc_.set_stop_gradient(stop_gradient); + need_updated_ = true; } - void ClearStopGradient() { desc_.clear_stop_gradient(); } + void ClearStopGradient() { + desc_.clear_stop_gradient(); + need_updated_ = true; + } bool HasStopGradient() const { return desc_.has_stop_gradient(); } @@ -152,6 +173,7 @@ class VarDesc { void SetNeedCheckFeed(bool need_check_feed) { desc_.set_need_check_feed(need_check_feed); + need_updated_ = true; } bool HasAttr(const std::string &name) const { @@ -168,7 +190,13 @@ class VarDesc { // The Id() and OriginalId() are only used for auto parallel. uint64_t Id() const { return id_; } uint64_t OriginalId() const { return original_id_; } - void SetOriginalId(uint64_t original_id) { original_id_ = original_id; } + void SetOriginalId(uint64_t original_id) { + original_id_ = original_id; + need_updated_ = true; + } + + bool NeedUpdate() const { return need_updated_; } + void SetNeedUpdate(bool need) { need_updated_ = need; } private: const proto::VarType::TensorDesc &tensor_desc() const; @@ -183,9 +211,12 @@ class VarDesc { return ++uid; } + // is it really needed? or just maintain a ptr from block? proto::VarDesc desc_; AttributeMap attrs_; + bool need_updated_{false}; + // Note: the id_ is unique for all VarDesc (only for auto parallel).
uint64_t id_ = GenerateId(); // Note: the orignal_id_ is used for referring to the original VarDesc diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 1c3165a4538a2..623a44ed75d1d 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -21,7 +21,7 @@ #include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" -#ifdef PADDLE_WITH_XPU_BKCL +#ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #endif #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index e417d3ad2f23b..e35568eb50c9a 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -259,5 +259,5 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { USE_OP_ITSELF(split); USE_OP_ITSELF(relu); #ifdef PADDLE_WITH_MKLDNN -USE_OP_DEVICE_KERNEL(relu, MKLDNN); +PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); #endif diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 717737749a96b..997022abde3f9 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -216,7 +216,7 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_dla, TensorRtUseDLA, bool); DECL_ARGUMENT_FIELD(tensorrt_dla_core, TensorRtDLACore, int); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); - DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); + DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int64_t); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_disabled_ops, TensorRtDisabledOPs, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 3c04638003cdd..723a787722143 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -133,7 +133,8 @@ void IRPassManager::CreatePasses(Argument *argument, argument->bfloat16_enabled_op_types())); #endif } else if (pass_name == "tensorrt_subgraph_pass") { - pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); + pass->Set("workspace_size", + new int64_t(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", new int(argument->tensorrt_min_subgraph_size())); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 48c35052b76ac..799b25a5cd1ee 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -1,3 +1,4 @@ + // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); @@ -379,7 +380,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( op_desc->SetBlockAttr("sub_block", new_block); op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString()); op_desc->SetAttr("max_batch_size", max_batch_size); - op_desc->SetAttr("workspace_size", Get("workspace_size")); + op_desc->SetAttr("workspace_size", Get("workspace_size")); op_desc->SetAttr("gpu_id", Get("gpu_device_id")); op_desc->SetAttr("output_name_mapping", output_mapping); op_desc->SetAttr("origin_output_dims", renamed_output_dims); @@ -500,7 +501,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( inference::Singleton::Global() .Create(engine_key + std::to_string(predictor_id), max_batch_size, - Get("workspace_size"), + Get("workspace_size"), precision_mode, calibrator.get(), Get("gpu_device_id"), diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index ca8ab8aa71ef6..efaf79d48b3f6 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -14,7 +14,10 @@ #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h" +#include +#include #include +#include #include #include "paddle/fluid/framework/block_desc.h" @@ -39,7 +42,106 @@ namespace analysis { namespace { -bool IsKernelSupportPrecision( +inline std::string SerializeParams(framework::Scope* scope, + const std::vector& params) { + std::ostringstream os; + phi::CPUContext ctx; + for (const auto& param : params) { + VLOG(3) << "Serialize param: " << param; + PADDLE_ENFORCE_NOT_NULL( + scope->FindVar(param), + platform::errors::NotFound("Block should already have a '%s' variable", + param)); + auto* tensor = scope->FindVar(param)->GetMutable(); + framework::SerializeToStream(os, *tensor, ctx); + } + return os.str(); +} + +inline void StrToBinary(const std::string& path, const std::string& str) { + std::ofstream file(path.c_str(), std::ios::binary); + file.write(str.c_str(), str.size()); + file.close(); +} +inline bool NodeVarHasDtype(framework::ir::Node* node) { + if (node->IsCtrlVar()) return false; + + if (node->IsVar() && + (node->Var()->GetType() == + paddle::framework::proto::VarType::SELECTED_ROWS || + node->Var()->GetType() == + paddle::framework::proto::VarType::LOD_TENSOR || + node->Var()->GetType() == + paddle::framework::proto::VarType::LOD_TENSOR_ARRAY || + node->Var()->GetType() == paddle::framework::proto::VarType::STRINGS || + node->Var()->GetType() == paddle::framework::proto::VarType::VOCAB)) { + return true; + } + + return false; +} +void SaveMixedModel(framework::ir::Graph* graph, + framework::Scope* scope, + framework::ProgramDesc* mixed_program_desc, + const std::string& mixed_model_file, + const std::string& mixed_params_file, + phi::DataType mixed_precision) { + paddle::CPUPlace place; + auto parameters = scope->LocalVarNames(); + std::sort(parameters.begin(), parameters.end()); + + std::unordered_set weights_should_be_fp32; + for (auto* node : graph->Nodes()) { + if (!(node->IsVar() && !node->IsCtrlVar())) continue; + if (NodeVarHasDtype(node)) { + if (node->Var()->Persistable() && + node->Var()->GetDataType() == + paddle::framework::proto::VarType::FP32) { + VLOG(2) << "weights keep to fp32: " << node->Name(); + weights_should_be_fp32.insert(node->Name()); + } + } + } + + for (const auto& param_name : parameters) { + auto* var = 
scope->FindLocalVar(param_name); + if (var->IsType() || + var->IsType()) { + auto* t = var->GetMutable(); + framework::Tensor mixed_tensor; + mixed_tensor.Resize(t->dims()); + auto* data = t->mutable_data(platform::CPUPlace()); + + if (mixed_precision == phi::DataType::FLOAT16 && + !weights_should_be_fp32.count(param_name)) { + mixed_tensor.set_type(paddle::experimental::DataType::FLOAT16); + auto* mixed_data = + mixed_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < t->numel(); i++) { + mixed_data[i] = static_cast(data[i]); + } + t->clear(); + paddle::framework::TensorCopySync(mixed_tensor, place, t); + } else if (mixed_precision == phi::DataType::BFLOAT16 && + !weights_should_be_fp32.count(param_name)) { + mixed_tensor.set_type(paddle::experimental::DataType::BFLOAT16); + auto* mixed_data = + mixed_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < t->numel(); i++) { + mixed_data[i] = static_cast(data[i]); + } + t->clear(); + paddle::framework::TensorCopySync(mixed_tensor, place, t); + } + } + } + + StrToBinary(mixed_model_file, + mixed_program_desc->Proto()->SerializeAsString()); + StrToBinary(mixed_params_file, SerializeParams(scope, parameters)); +} + +bool PhiKernelSupportPrecision( const std::string& op_type, phi::Backend backend, phi::DataType data_type, @@ -56,10 +158,23 @@ bool GpuKernelSupportPrecision( const std::string& op_type, phi::DataType data_type, phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) { - bool res = - IsKernelSupportPrecision(op_type, phi::Backend::GPU, data_type, layout); - res |= IsKernelSupportPrecision( - op_type, phi::Backend::GPUDNN, data_type, layout); + auto phi_op_type = phi::TransToPhiKernelName(op_type); + bool res = PhiKernelSupportPrecision( + phi_op_type, phi::Backend::GPU, data_type, layout); + res |= PhiKernelSupportPrecision( + phi_op_type, phi::Backend::GPUDNN, data_type, layout); + + if (!res) { + auto& all_kernels = OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + if (it != all_kernels.end()) { + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + res = true; + } + } + } + } return res; } @@ -90,30 +205,16 @@ bool OutShouldNotConvert(ir::Node* var_node) { return false; } - -// Get weight names which appear in multiple block (block 0 and block n). -std::unordered_set GetMultiBlockPersistableNames( - framework::ProgramDesc* program_desc) { - std::unordered_set special_weights; - size_t block_size = program_desc->Size(); - - std::unordered_set block_0_weights; - for (auto var : program_desc->Block(0).AllVars()) { - if (var->Persistable()) block_0_weights.insert(var->Name()); - } - - for (size_t i = 1; i < block_size; ++i) { - // std::cout << program_desc->MutableBlock(i)->Proto()->DebugString() << - // std::endl;; - auto all_ops = program_desc->Block(i).AllOps(); - for (auto op : all_ops) { - for (auto name : op->InputArgumentNames()) { - if (block_0_weights.count(name)) special_weights.insert(name); - } - } +void ProcessOutputNode(ir::Node* var_node, + framework::proto::VarType::Type to_type) { + if (!NodeVarHasDtype(var_node)) return; + auto* out_var = var_node->Var(); + if (out_var->GetDataType() == framework::proto::VarType::FP32) { + if (OutShouldNotConvert(var_node)) return; + out_var->SetDataType(to_type); } - - return special_weights; + VLOG(3) << " out_node name " << var_node->Name() << " data_type " + << out_var->GetDataType(); } // Just process special cases for weights conversion. 
@@ -143,21 +244,8 @@ bool WeightsShouldNotConvert(ir::Node* var_node) { } } - // If cur_op's next is condition_flow op, then cur op should be fp32. Note, we - // now only convert to mixed in block 0. - for (auto* op_node : op_nodes) { - for (auto var : op_node->outputs) { - for (auto next_op : var->outputs) { - if (next_op->Op()->HasAttr("sub_block")) { - return true; - } - } - } - } - return false; } - inline bool IsFloatVarType(framework::proto::VarType::Type type) { if (type == framework::proto::VarType::FP16 || type == framework::proto::VarType::FP32 || @@ -165,6 +253,56 @@ inline bool IsFloatVarType(framework::proto::VarType::Type type) { return true; return false; } +void ProcessInputNode( + bool support_precision, + framework::ir::Graph* graph, + ir::Node* in_node, + ir::Node* op_node, + int* suffix, + framework::BlockDesc* block_desc, + std::unordered_map* cast_map, + framework::proto::VarType::Type to_type, + bool is_main_block, + std::unordered_map* + vars_in_multi_block_map) { + if (!NodeVarHasDtype(in_node)) return; + auto* in_var = in_node->Var(); + auto in_var_type = in_var->GetDataType(); + if (!is_main_block && vars_in_multi_block_map->count(in_var->Name())) { + in_var_type = vars_in_multi_block_map->at(in_var->Name()); + } + if (support_precision) { + if (in_var->Persistable() && + in_var_type == framework::proto::VarType::FP32) { + if (WeightsShouldNotConvert(in_node)) return; + in_var->SetDataType(to_type); + } else if (!in_var->Persistable() && IsFloatVarType(in_var_type) && + in_var_type != to_type) { + AddCastOp(graph, + in_node, + op_node, + in_var_type, + to_type, + suffix, + block_desc, + cast_map); + } + } else { + if (!in_var->Persistable() && IsFloatVarType(in_var_type) && + in_var_type != to_type) { + AddCastOp(graph, + in_node, + op_node, + in_var_type, + to_type, + suffix, + block_desc, + cast_map); + } + } + VLOG(3) << " in_node name " << in_var->Name() << " data_type " + << in_var->GetDataType(); +} void ConvertAllFp64ToFp32(framework::ir::Graph* graph) { auto op_nodes = framework::ir::TopologySortOperations(*graph); @@ -239,6 +377,11 @@ void HandleSpecialOps(framework::OpDesc* op_desc) { static_cast(framework::proto::VarType::FP32)) op_desc->SetAttr("dtype", static_cast(framework::proto::VarType::FP16)); + } else if (op_desc->Type() == "fill_constant_batch_size_like") { + if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) == + static_cast(framework::proto::VarType::FP32)) + op_desc->SetAttr("dtype", + static_cast(framework::proto::VarType::FP16)); } } @@ -260,26 +403,47 @@ void FixCastAttr(framework::ir::Graph* graph) { } } -// If op's output var is condition flow op's input, then the op must be fp32 -// precision. 
-bool NextOpIncludesConditionFlowOp(framework::ir::Node* cur_op_node) { - auto cur_op_outs = cur_op_node->outputs; - for (auto out_var : cur_op_outs) { - for (auto next_op_node : out_var->outputs) { - if (next_op_node->Op()->HasAttr("sub_block")) { - return true; - } +void FindVarsInMultiBlock( + framework::ProgramDesc* program_desc, + std::unordered_map* + vars_in_multi_block_map) { + std::set vars_in_multi_block; + std::set main_block_var_names_set; + for (auto op : program_desc->Block(0).AllOps()) { + auto in_names = op->InputArgumentNames(); + main_block_var_names_set.insert(in_names.begin(), in_names.end()); + } + + for (size_t i = 1; i < program_desc->Size(); ++i) { + std::set block_var_names_set; + for (auto op : program_desc->Block(i).AllOps()) { + auto in_names = op->InputArgumentNames(); + block_var_names_set.insert(in_names.begin(), in_names.end()); } + + std::set_intersection( + main_block_var_names_set.begin(), + main_block_var_names_set.end(), + block_var_names_set.begin(), + block_var_names_set.end(), + std::inserter(vars_in_multi_block, vars_in_multi_block.begin())); + } + + for (auto name : vars_in_multi_block) { + vars_in_multi_block_map->emplace(name, framework::proto::VarType::FP32); } - return false; } -void ConvertTensorDtype(framework::ProgramDesc* program_desc, - framework::ir::Graph* graph, - const std::unordered_set& blacklist, - bool keep_io_types, - phi::Backend backend, - phi::DataType tensor_dtype) { +void ConvertTensorDtype( + framework::ProgramDesc* program_desc, + framework::ir::Graph* graph, + const std::unordered_set& blacklist, + bool keep_io_types, + phi::Backend backend, + phi::DataType tensor_dtype, + bool is_main_block, + std::unordered_map* + vars_in_multi_block_map) { framework::proto::VarType::Type to_type; if (tensor_dtype == phi::DataType::FLOAT16) { to_type = framework::proto::VarType::FP16; @@ -287,25 +451,27 @@ void ConvertTensorDtype(framework::ProgramDesc* program_desc, to_type = framework::proto::VarType::BF16; } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "mixed_precision currently not supported dtype %d, we now only support " + "mixed_precision currently not supported dtype %d, we now only " + "support " "fp16 and bf16.", static_cast(tensor_dtype))); } - auto weight_name_in_multi_block = GetMultiBlockPersistableNames(program_desc); + auto* block_desc = + framework::ir::TopologySortOperations(*graph)[0]->Op()->Block(); + int num_low_precision = 0; int suffix = 0; - framework::BlockDesc* block_desc{nullptr}; std::vector output_nodes; std::unordered_map cast_map; auto op_nodes = framework::ir::TopologySortOperations(*graph); for (auto* op_node : op_nodes) { if (!op_node->IsOp()) continue; auto op_type = op_node->Op()->Type(); - auto phi_op_type = phi::TransToPhiKernelName(op_type); + VLOG(3) << "-------------------- op_type " << op_type << ", phi_type " + << phi::TransToPhiKernelName(op_type); // 1. set input dtype. if (op_type == "feed") { - block_desc = op_node->Op()->Block(); auto feed_var = op_node->outputs[0]->Var(); if (!keep_io_types && feed_var->GetDataType() == framework::proto::VarType::FP32) { @@ -319,71 +485,73 @@ void ConvertTensorDtype(framework::ProgramDesc* program_desc, continue; } + else if (op_node->Op()->HasAttr("sub_block")) { // NOLINT + // sub_block op's output dtype should be same as input dtype, if have the + // same name. 
+ std::unordered_map in_name_to_node; + for (auto* in : op_node->inputs) { + if (NodeVarHasDtype(in)) { + in_name_to_node[in->Name()] = in; + } + } + + for (auto out : op_node->outputs) { + if (NodeVarHasDtype(out)) { + if (in_name_to_node.count(out->Name())) + out->Var()->SetDataType( + in_name_to_node[out->Name()]->Var()->GetDataType()); + } + } + + continue; + } + // 2. if op support fp16/bf16 and not in blacklist. // - cast weight to fp16/bf16. // - add cast op if the input dtype is not fp16/bf16. // - set output dtype. - else if (blacklist.count(phi_op_type) == 0 && // NOLINT - !NextOpIncludesConditionFlowOp(op_node)) { + else if (blacklist.count(op_type) == 0) { // NOLINT bool support_precision = - OpSupportPrecision(phi_op_type, backend, tensor_dtype, blacklist); - VLOG(2) << "op_type " << op_type << ", phi_op_type " << phi_op_type - << " support low precision " << support_precision << ", " + OpSupportPrecision(op_type, backend, tensor_dtype, blacklist); + VLOG(2) << "op_type " << op_type << ", phi_op_type " + << phi::TransToPhiKernelName(op_type) << " support low precision " + << support_precision << ", " << reinterpret_cast(op_node->Op()->Block()); - for (auto in_node : op_node->inputs) { - if (weight_name_in_multi_block.count(in_node->Name())) - support_precision = false; - } - if (support_precision) { HandleSpecialOps(op_node->Op()); ++num_low_precision; auto inputs = op_node->inputs; + // Process inputs. for (auto* in_node : inputs) { - if (in_node->IsCtrlVar()) continue; - auto* in_var = in_node->Var(); - if (in_var->Persistable() && - in_var->GetDataType() == framework::proto::VarType::FP32) { - if (WeightsShouldNotConvert(in_node)) continue; - in_var->SetDataType(to_type); - } else if (!in_var->Persistable() && - IsFloatVarType(in_var->GetDataType()) && - in_var->GetDataType() != to_type) { - AddCastOp(graph, - in_node, - op_node, - in_var->GetDataType(), - to_type, - &suffix, - block_desc, - &cast_map); - } + ProcessInputNode(true, + graph, + in_node, + op_node, + &suffix, + block_desc, + &cast_map, + to_type, + is_main_block, + vars_in_multi_block_map); } + // Process outputs. for (auto* out_node : op_node->outputs) { - if (out_node->IsCtrlVar()) continue; - auto* out_var = out_node->Var(); - if (out_var->GetDataType() == framework::proto::VarType::FP32) { - if (OutShouldNotConvert(out_node)) continue; - out_var->SetDataType(to_type); - } + ProcessOutputNode(out_node, to_type); } } else { auto inputs = op_node->inputs; for (auto* in_node : inputs) { - if (in_node->IsCtrlVar()) continue; - auto* in_var = in_node->Var(); - if (!in_var->Persistable() && IsFloatVarType(in_var->GetDataType()) && - in_var->GetDataType() != framework::proto::VarType::FP32) { - AddCastOp(graph, - in_node, - op_node, - in_var->GetDataType(), - framework::proto::VarType::FP32, - &suffix, - block_desc, - &cast_map); - } + ProcessInputNode(false, + graph, + in_node, + op_node, + &suffix, + block_desc, + &cast_map, + framework::proto::VarType::FP32, + is_main_block, + vars_in_multi_block_map); } } } @@ -409,8 +577,8 @@ void ConvertTensorDtype(framework::ProgramDesc* program_desc, } } - // 4. if output_op's dtype is not compatible to output dtype, then just insert - // cast. + // 4. if output_op's dtype is not compatible to output dtype, then just + // insert cast. 
for (auto* node : output_nodes) { if (node->IsCtrlVar()) continue; auto var = node->Var(); @@ -438,22 +606,31 @@ void ConvertTensorDtype(framework::ProgramDesc* program_desc, } } + if (is_main_block) { + for (auto node : graph->Nodes()) { + if (vars_in_multi_block_map->count(node->Name())) { + vars_in_multi_block_map->at(node->Name()) = node->Var()->GetDataType(); + } + } + } + if (num_low_precision) LOG(INFO) << "--- detected " << num_low_precision << " low precision ops"; } } // namespace -bool OpSupportPrecision(const std::string& phi_op_type, +bool OpSupportPrecision(const std::string& op_type, phi::Backend backend, phi::DataType precision, const std::unordered_set& blacklist) { + auto phi_op_type = phi::TransToPhiKernelName(op_type); bool support_precision = false; - if (blacklist.count(phi_op_type) == 0) { + if (blacklist.count(op_type) == 0) { if (backend == phi::Backend::GPU) - support_precision = GpuKernelSupportPrecision(phi_op_type, precision); + support_precision = GpuKernelSupportPrecision(op_type, precision); else support_precision = - IsKernelSupportPrecision(phi_op_type, backend, precision); + PhiKernelSupportPrecision(phi_op_type, backend, precision); } return support_precision; } @@ -521,102 +698,41 @@ void ConvertToMixedPrecision(const std::string& model_file, framework::Scope scope; auto program_desc = inference::Load(&executor, &scope, model_file, params_file); - auto graph = std::unique_ptr( + auto main_graph = std::unique_ptr( new framework::ir::Graph(*program_desc)); - ConvertAllFp64ToFp32(graph.get()); - ConvertTensorDtype(program_desc.get(), - graph.get(), - black_list, - keep_io_types, - backend, - mixed_precision); - FixCastAttr(graph.get()); - - framework::ProgramDesc mixed_program_desc; - framework::ir::GraphToProgram(*graph, &mixed_program_desc); - - auto parameters = scope.LocalVarNames(); - std::sort(parameters.begin(), parameters.end()); - - auto serialize_params = - [](framework::Scope* scope, - const std::vector& params) -> std::string { - std::ostringstream os; - phi::CPUContext ctx; - for (const auto& param : params) { - VLOG(3) << "Serialize param: " << param; - PADDLE_ENFORCE_NOT_NULL( - scope->FindVar(param), - platform::errors::NotFound( - "Block should already have a '%s' variable", param)); - auto* tensor = scope->FindVar(param)->GetMutable(); - framework::SerializeToStream(os, *tensor, ctx); - } - return os.str(); - }; - - std::unordered_set weights_should_be_fp32; - for (auto* node : graph->Nodes()) { - if (!(node->IsVar() && !node->IsCtrlVar())) continue; - if (node->Var()->GetType() == - paddle::framework::proto::VarType::SELECTED_ROWS || - node->Var()->GetType() == - paddle::framework::proto::VarType::LOD_TENSOR || - node->Var()->GetType() == - paddle::framework::proto::VarType::LOD_TENSOR_ARRAY || - node->Var()->GetType() == paddle::framework::proto::VarType::STRINGS || - node->Var()->GetType() == paddle::framework::proto::VarType::VOCAB) { - if (node->Var()->Persistable() && - node->Var()->GetDataType() == - paddle::framework::proto::VarType::FP32) { - VLOG(2) << "weights keep to fp32: " << node->Name(); - weights_should_be_fp32.insert(node->Name()); - } - } - } - - for (const auto& param_name : parameters) { - auto* var = scope.FindLocalVar(param_name); - if (var->IsType() || - var->IsType()) { - auto* t = var->GetMutable(); - framework::Tensor mixed_tensor; - mixed_tensor.Resize(t->dims()); - auto* data = t->mutable_data(platform::CPUPlace()); - - if (mixed_precision == phi::DataType::FLOAT16 && - 
!weights_should_be_fp32.count(param_name)) { - mixed_tensor.set_type(paddle::experimental::DataType::FLOAT16); - auto* mixed_data = - mixed_tensor.mutable_data(platform::CPUPlace()); - for (int i = 0; i < t->numel(); i++) { - mixed_data[i] = static_cast(data[i]); - } - t->clear(); - paddle::framework::TensorCopySync(mixed_tensor, place, t); - } else if (mixed_precision == phi::DataType::BFLOAT16 && - !weights_should_be_fp32.count(param_name)) { - mixed_tensor.set_type(paddle::experimental::DataType::BFLOAT16); - auto* mixed_data = - mixed_tensor.mutable_data(platform::CPUPlace()); - for (int i = 0; i < t->numel(); i++) { - mixed_data[i] = static_cast(data[i]); - } - t->clear(); - paddle::framework::TensorCopySync(mixed_tensor, place, t); - } - } + std::unordered_map + vars_in_multi_block_map; + FindVarsInMultiBlock(program_desc.get(), &vars_in_multi_block_map); + + for (size_t i = 0; i < main_graph->SubGraphsSize(); ++i) { + auto graph = main_graph->GetSubGraph(i); + VLOG(2) << " -------- handle subgraph " << i << ", has " + << graph->Nodes().size() << " nodes"; + + program_desc->Block(i).LocalVarNames(); + + ConvertAllFp64ToFp32(graph); + ConvertTensorDtype(program_desc.get(), + graph, + black_list, + keep_io_types, + backend, + mixed_precision, + i == 0, + &vars_in_multi_block_map); + FixCastAttr(graph); } - auto StrToBinary = [](const std::string& path, const std::string& str) { - std::ofstream file(path.c_str(), std::ios::binary); - file.write(str.c_str(), str.size()); - file.close(); - }; - StrToBinary(mixed_model_file, - mixed_program_desc.Proto()->SerializeAsString()); - StrToBinary(mixed_params_file, serialize_params(&scope, parameters)); + framework::ProgramDesc mixed_program_desc; + framework::ir::GraphToProgram(*main_graph, &mixed_program_desc); + + SaveMixedModel(main_graph.get(), + &scope, + &mixed_program_desc, + mixed_model_file, + mixed_params_file, + mixed_precision); } } // namespace analysis diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index ae90618f5207c..2492590131260 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -410,6 +410,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { pass_builder_->DeletePass(ps); } } + + for (auto &delete_pass : other.pass_builder()->GetAllDeletedPasses()) { + pass_builder_->DeletePass(delete_pass); + } } void AnalysisConfig::EnableCUDNN() { @@ -513,7 +517,7 @@ MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { } void AnalysisConfig::EnableTensorRtEngine( - int workspace_size, + int64_t workspace_size, int max_batch_size, int min_subgraph_size, AnalysisConfig::Precision precision_mode, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index bde92c13b4cb2..9c673dfc57574 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2096,6 +2096,8 @@ USE_TRT_CONVERTER(preln_residual_bias) USE_TRT_CONVERTER(c_allreduce_sum) USE_TRT_CONVERTER(roll) USE_TRT_CONVERTER(strided_slice) +USE_TRT_CONVERTER(rnn) +USE_TRT_CONVERTER(fill_constant_batch_size_like) USE_TRT_CONVERTER(transformer_input_convert) USE_TRT_CONVERTER(cast) USE_TRT_CONVERTER(recover_padding) diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 08d0e073babc1..b925a0c361f94 100644 --- 
a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -523,7 +523,7 @@ struct PD_INFER_DECL AnalysisConfig { /// quantization). /// /// - void EnableTensorRtEngine(int workspace_size = 1 << 20, + void EnableTensorRtEngine(int64_t workspace_size = 1 << 30, int max_batch_size = 1, int min_subgraph_size = 3, Precision precision = Precision::kFloat32, @@ -967,7 +967,7 @@ struct PD_INFER_DECL AnalysisConfig { bool use_tensorrt_{false}; // For workspace_size, refer it from here: // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting - int tensorrt_workspace_size_{1 << 30}; + int64_t tensorrt_workspace_size_{1 << 30}; // While TensorRT allows an engine optimized for a given max batch size // to run at any smaller size, the performance for those smaller // sizes may not be as well-optimized. Therefore, Max batch is best diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 258b0fae16e75..a4b2052390f82 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -167,6 +167,7 @@ const std::vector kGpuLowerPrecisionPasses{ "gpu_cpu_map_matmul_v2_to_matmul_pass", "fc_fuse_pass", "fc_elementwise_layernorm_fuse_pass", + "runtime_context_cache_pass", }; const std::vector kTrtLowerPrecisionPasses{ @@ -301,12 +302,12 @@ void CpuPassStrategy::EnableMKLDNN() { // "conv3d_bias_mkldnn_fuse_pass", // "conv_elementwise_add_mkldnn_fuse_pass", "conv_concat_relu_mkldnn_fuse_pass", - "conv_activation_mkldnn_fuse_pass", // - "scale_matmul_fuse_pass", // - "reshape_transpose_matmul_mkldnn_fuse_pass", // - "reshape_transpose_matmul_v2_mkldnn_fuse_pass", // - "matmul_transpose_reshape_fuse_pass", // - "matmul_v2_transpose_reshape_fuse_pass", // + "conv_activation_mkldnn_fuse_pass", // + "scale_matmul_fuse_pass", // + "reshape_transpose_matmul_mkldnn_fuse_pass", // + "matmul_transpose_reshape_mkldnn_fuse_pass", // + "matmul_elementwise_add_mkldnn_fuse_pass", // + "matmul_activation_mkldnn_fuse_pass", // // Disabled due to topology-dependent speed-up // "fc_mkldnn_pass", // "fc_act_mkldnn_fuse_pass", @@ -315,7 +316,6 @@ void CpuPassStrategy::EnableMKLDNN() { "softplus_activation_mkldnn_fuse_pass", // "shuffle_channel_mkldnn_detect_pass", // "elt_act_mkldnn_fuse_pass", // - "matmul_activation_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. 
// https://github.com/PaddlePaddle/Paddle/issues/29710 // "mkldnn_inplace_pass", // This pass should be activated after @@ -400,14 +400,12 @@ void CpuPassStrategy::EnableMkldnnInt8() { passes_.push_back("repeated_fc_relu_fuse_pass"); passes_.push_back("fc_mkldnn_pass"); passes_.push_back("fc_act_mkldnn_fuse_pass"); - passes_.push_back("matmul_transpose_reshape_fuse_pass"); - passes_.push_back("matmul_v2_transpose_reshape_fuse_pass"); + passes_.push_back("matmul_transpose_reshape_mkldnn_fuse_pass"); passes_.push_back("batch_norm_act_fuse_pass"); passes_.push_back("softplus_activation_mkldnn_fuse_pass"); passes_.push_back("compute_propagate_scales_mkldnn_pass"); passes_.push_back("scale_matmul_fuse_pass"); passes_.push_back("reshape_transpose_matmul_mkldnn_fuse_pass"); - passes_.push_back("reshape_transpose_matmul_v2_mkldnn_fuse_pass"); passes_.push_back("cpu_quantize_placement_pass"); passes_.push_back("cpu_quantize_pass"); passes_.push_back("cpu_quantize_squash_pass"); diff --git a/paddle/fluid/inference/capi/paddle_c_api.h b/paddle/fluid/inference/capi/paddle_c_api.h index 1b8bd08b76bad..25ede726b144b 100644 --- a/paddle/fluid/inference/capi/paddle_c_api.h +++ b/paddle/fluid/inference/capi/paddle_c_api.h @@ -214,7 +214,7 @@ PADDLE_CAPI_EXPORT extern bool PD_SpecifyInputName( PADDLE_CAPI_EXPORT extern void PD_EnableTensorRtEngine( PD_AnalysisConfig* config, - int workspace_size, + int64_t workspace_size, int max_batch_size, int min_subgraph_size, Precision precision, diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index b6d865ff3490c..45fd2e45c1991 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -243,7 +243,7 @@ bool PD_SpecifyInputName(const PD_AnalysisConfig* config) { } void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, - int workspace_size, + int64_t workspace_size, int max_batch_size, int min_subgraph_size, Precision precision, diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index a72497940d9da..b183ba8c63b25 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -219,7 +219,7 @@ PD_Bool PD_ConfigIrOptim(__pd_keep PD_Config* pd_config) { } void PD_ConfigEnableTensorRtEngine(__pd_keep PD_Config* pd_config, - int32_t workspace_size, + int64_t workspace_size, int32_t max_batch_size, int32_t min_subgraph_size, PD_PrecisionType precision, diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index 9e06d8c72f048..a7054d5390838 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -329,7 +329,7 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim( /// PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtEngine( __pd_keep PD_Config* pd_config, - int32_t workspace_size, + int64_t workspace_size, int32_t max_batch_size, int32_t min_subgraph_size, PD_PrecisionType precision, diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 4a13b6c00ac00..4f563c2df8ee2 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -69,6 +69,8 @@ list( top_k_op.cc squeeze2_op.cc unsqueeze2_op.cc + rnn_op.cc + fill_constant_batch_size_like_op.cc sum_op.cc shape_op.cc fill_constant_op.cc diff --git 
a/paddle/fluid/inference/tensorrt/convert/fill_constant_batch_size_like_op.cc b/paddle/fluid/inference/tensorrt/convert/fill_constant_batch_size_like_op.cc new file mode 100644 index 0000000000000..5f00777a663af --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/fill_constant_batch_size_like_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class FillConstantBatchSizeLikeOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { +#if IS_TRT_VERSION_GE(7000) + VLOG(4) << "convert a fluid fill_constant_batch_size_like op to tensorrt " + "fill_constant_batch_size_like layer"; + + framework::OpDesc op_desc(op, nullptr); + auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + int dtype = PADDLE_GET_CONST(int, op_desc.GetAttr("dtype")); + // be float + PADDLE_ENFORCE_EQ(dtype, + 5, + platform::errors::InvalidArgument( + "fill_constant_batch_size_like's input data type " + "must be float in Paddle-TRT.")); + + int input_dim_idx = PADDLE_GET_CONST(int, op_desc.GetAttr("input_dim_idx")); + size_t output_dim_idx = + PADDLE_GET_CONST(int, op_desc.GetAttr("output_dim_idx")); + std::string str_value = + PADDLE_GET_CONST(std::string, op_desc.GetAttr("str_value")); + std::vector shape = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("shape")); + float value = std::stof(str_value); + + auto* input_shape_tensor = Shape(input); + auto* batch_tensor = GetEleTensorOfShape(input_shape_tensor, input_dim_idx); + std::string name = "_add_fill_constant_batch_size_like_op_"; + auto shape_attr_tensor = Add1DConstantLayer(shape, name + "shape_attr"); + std::vector gather_out_shape_indices; + for (size_t i = 0; i < shape.size(); i++) { + if (i == output_dim_idx) { + gather_out_shape_indices.push_back(shape.size()); + continue; + } + gather_out_shape_indices.push_back(i); + } + std::vector concat_inputs{shape_attr_tensor, + batch_tensor}; + auto out_shape_tensor = + Gather(Concat(concat_inputs), gather_out_shape_indices); + auto layer = TRT_ENGINE_ADD_LAYER( + engine_, Fill, nvinfer1::Dims{}, nvinfer1::FillOperation::kLINSPACE); + std::vector value_vec(1, value); + std::vector beta_vec(3, 0.); + layer->setAlpha(value); + layer->setBeta(0.f); + layer->setInput(0, *out_shape_tensor); + layer->setInput(1, *Add1DConstantLayer(value_vec, name + "alpha", true)); + layer->setInput(2, *Add1DConstantLayer(beta_vec, name + "beta", false)); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput( + layer, "fill_constant_batch_size_like", {output_name}, test_mode); +#endif + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(fill_constant_batch_size_like, + FillConstantBatchSizeLikeOpConverter); diff --git 
a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 7c5654bf17c04..5e4920e6517ec 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -318,6 +318,10 @@ class MultiheadMatMulOpConverter : public OpConverter { engine_->GetFp32TrtWeight(biasqk_name, *biasqk_t); biasQK_constLayer = TRT_ENGINE_ADD_LAYER( engine_, Constant, biasqk_dims, biasqk_const_weight.get()); + float* biasqk_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(biasqk_name, *biasqk_t).get().values)); + printf("@@ in convert biasqk_data 0 1 2 3: %f %f %f %f \r\n",biasqk_data[0],biasqk_data[1],biasqk_data[2],biasqk_data[3]); + engine_->SetITensor(biasqk_name,biasQK_constLayer->getOutput(0)); op_desc.SetInput("BiasQK",{biasqk_name}); } diff --git a/paddle/fluid/inference/tensorrt/convert/rnn_op.cc b/paddle/fluid/inference/tensorrt/convert/rnn_op.cc new file mode 100644 index 0000000000000..945495c0d1623 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/rnn_op.cc @@ -0,0 +1,320 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class RnnNativeOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { +#if IS_TRT_VERSION_GE(7000) + VLOG(4) << "convert a fluid rnn op to tensorrt rnn layer"; + + framework::OpDesc op_desc(op, nullptr); + // [seq_len, batch ,in_size], + // [K * num_layers, batch ,in_size], [K * num_layers, batch ,in_size] + // K is defined below + auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + auto* prev_c = engine_->GetITensor(op_desc.Input("PreState")[0]); + auto* prev_h = engine_->GetITensor(op_desc.Input("PreState")[1]); + + PADDLE_ENFORCE_EQ(input->getDimensions().nbDims, + 3, + platform::errors::InvalidArgument( + "RNN(LSTM)'s input must be 3 dimensions, i.e. " + "[seq_len, batch, input_size]," + "but now is %d dimensions.", + input->getDimensions().nbDims)); + + PADDLE_ENFORCE_EQ(prev_h->getDimensions().nbDims, + 3, + platform::errors::InvalidArgument( + "RNN(LSTM)'s PreState(Hidden) must be 3 dimensions, " + "i.e. [num_layers, batch, hidden_size]," + "but now is %d dimensions.", + prev_h->getDimensions().nbDims)); + + PADDLE_ENFORCE_EQ(prev_c->getDimensions().nbDims, + 3, + platform::errors::InvalidArgument( + "RNN(LSTM)'s PreState(Cell) must be 3 dimensions, " + "i.e. 
[num_layers, batch, hidden_size]," + "but now is %d dimensions.", + prev_c->getDimensions().nbDims)); + + int num_layers = PADDLE_GET_CONST(int, op_desc.GetAttr("num_layers")); + int hidden_size = PADDLE_GET_CONST(int, op_desc.GetAttr("hidden_size")); + int input_size = PADDLE_GET_CONST(int, op_desc.GetAttr("input_size")); + bool is_bidirec = PADDLE_GET_CONST(bool, op_desc.GetAttr("is_bidirec")); + int K = is_bidirec ? 2 : 1; + + // extract weights + // if is_bidirec, make forward and backward weight/bias concated + std::vector weight_bias_vec; + for (int layer_id = 0; layer_id < num_layers; layer_id++) { + if (is_bidirec) { + auto extract_and_combine_weight = [&](int start) { + // k and k + 2 is combined ! + // k + 1 and k + 3 is combined ! + for (int k = 0; k < K; k++) { + std::string var0_name = op_desc.Input("WeightList")[k + start]; + std::string var1_name = op_desc.Input("WeightList")[k + 2 + start]; + auto* var0_v = scope.FindVar(var0_name); + auto* var1_v = scope.FindVar(var1_name); + auto* var0_t = var0_v->GetMutable(); + auto* var1_t = var1_v->GetMutable(); + const float* data0_ptr = reinterpret_cast( + engine_->GetTrtWeight(var0_name, *var0_t).get().values); + const float* data1_ptr = reinterpret_cast( + engine_->GetTrtWeight(var1_name, *var1_t).get().values); + float* data_ptr = new float[K * var0_t->numel()]; + // remember free + memcpy(data_ptr, data0_ptr, sizeof(float) * var0_t->numel()); + memcpy(data_ptr + var0_t->numel(), + data1_ptr, + sizeof(float) * var1_t->numel()); + weight_bias_vec.push_back(data_ptr); + } + }; + extract_and_combine_weight(4 * layer_id); + extract_and_combine_weight(4 * layer_id + 4 * num_layers); + } else { + auto extract_weight = [&](int start) { + for (int k = 0; k < 2 * K; k++) { + std::string var_name = op_desc.Input("WeightList")[k + start]; + auto* var_v = scope.FindVar(var_name); + auto* var_t = var_v->GetMutable(); + const float* data_ptr = reinterpret_cast( + engine_->GetTrtWeight(var_name, *var_t).get().values); + weight_bias_vec.push_back(data_ptr); + } + }; + extract_weight(2 * layer_id); // filter + extract_weight(2 * num_layers + 2 * layer_id); // bias + } + } + // [seq_len, batch ,in_size] + + nvinfer1::ITensor* this_input = + TRT_ENGINE_ADD_LAYER(engine_, Identity, *input)->getOutput(0); + + nvinfer1::ILayer* finally_layer = nullptr; + for (int layer_id = 0; layer_id < num_layers; layer_id++) { + auto* loop = TRT_ENGINE_ADD_LAYER(engine_, Loop); + auto* input_shape_tensor = Shape(this_input); + auto* seq_len_scalar = GetEleTensorOfShape(input_shape_tensor, 0, true); + auto* seq_len_tensor = GetEleTensorOfShape(input_shape_tensor, 0); + auto* batch_tensor = GetEleTensorOfShape(input_shape_tensor, 1); + auto* K_tensor = Add1DConstantLayer(K); + auto* hidden_size_tensor = Add1DConstantLayer(hidden_size); + + if (layer_id > 0) input_size = K * hidden_size; + auto* input_size_tensor = Add1DConstantLayer(input_size); + + loop->addTripLimit(*seq_len_scalar, nvinfer1::TripLimit::kCOUNT); + + nvinfer1::ITensor* iter_input_tensor; + auto* iter_input_forward_tensor = + loop->addIterator(*this_input)->getOutput(0); // [batch, input_size] + + // this function shuffle tensor -> 4 dims + auto reshape2four = [&](nvinfer1::ITensor** tensor) { +#if TRT_VERSION == 7234 + auto* tmp_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, **tensor); + std::vector concat_inputs{ + Add1DConstantLayer(1), Add1DConstantLayer(1), Shape(*tensor)}; + tmp_layer->setInput(1, *Concat(concat_inputs)); + *tensor = tmp_layer->getOutput(0); +#endif + }; + + 
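+      // pad the per-step iterator output to 4 dims (a no-op unless TRT_VERSION == 7234)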
reshape2four(&iter_input_forward_tensor); + + if (is_bidirec) { + auto* iter_input_reverse_tensor = + loop->addIterator(*this_input, 0, true) + ->getOutput(0); // [batch, input_size] + + reshape2four(&iter_input_reverse_tensor); + + std::vector concat_inputs{ + iter_input_forward_tensor, iter_input_reverse_tensor}; + iter_input_tensor = Concat(concat_inputs); + } else { + iter_input_tensor = iter_input_forward_tensor; + } + + auto* tmp_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *iter_input_tensor); + + tmp_layer->setInput(1, + *Concat(std::vector{ + K_tensor, batch_tensor, input_size_tensor})); + + iter_input_tensor = tmp_layer->getOutput(0); + // [K, batch, input_size] + + std::vector tmp_vec(K); + std::iota(tmp_vec.begin(), tmp_vec.end(), 2 * layer_id); + auto* first_prev_h = Gather(prev_h, tmp_vec); + auto* first_prev_c = Gather(prev_c, tmp_vec); + + nvinfer1::IRecurrenceLayer* Hlayer = loop->addRecurrence(*first_prev_h); + nvinfer1::IRecurrenceLayer* Clayer = loop->addRecurrence(*first_prev_c); + + // k is weight + // k + 2 is bias + auto run_matmul_bias = [&](int k, bool is_input) -> nvinfer1::ITensor* { + int h = 4 * hidden_size; + int w = is_input ? input_size : hidden_size; + if (is_input && k > 0) w = K * hidden_size; + + auto weight_shape = nvinfer1::Dims3{K, h, w}; + auto* weight_tensor = + AddConstantLayer(weight_bias_vec[k], weight_shape, " "); + auto bias_shape = nvinfer1::Dims3{K, 1, h}; + auto* bias_tensor = + AddConstantLayer(weight_bias_vec[k + 2], bias_shape, " "); + + nvinfer1::ITensor* iter_tensor = + k % 2 ? Hlayer->getOutput(0) : iter_input_tensor; + + auto* iter_w_tensor = + TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *iter_tensor, + nvinfer1::MatrixOperation::kNONE, + *weight_tensor, + nvinfer1::MatrixOperation::kTRANSPOSE) + ->getOutput(0); + + auto* iter_w_b_tensor = Sum(iter_w_tensor, bias_tensor); + return iter_w_b_tensor; + }; + + nvinfer1::ITensor* iter_input_w_b_tensor = + run_matmul_bias(layer_id * 4, true); + nvinfer1::ITensor* iter_hidden_w_b_tensor = + run_matmul_bias(layer_id * 4 + 1, false); + auto* iter_input_hidden_add_tensor = + Sum(iter_input_w_b_tensor, iter_hidden_w_b_tensor); + + nvinfer1::Dims start_dims = nvinfer1::Dims3{0, 0, 0}; + nvinfer1::Dims size_dims = nvinfer1::Dims3{0, 0, 0}; + auto* size_dims_tensor = Concat(std::vector{ + K_tensor, batch_tensor, hidden_size_tensor}); + nvinfer1::Dims step_dims = nvinfer1::Dims3{1, 1, 1}; + + std::vector lstm_act{ + nvinfer1::ActivationType::kSIGMOID, nvinfer1::ActivationType::kTANH}; + + auto split_gate = [&](int i, int act_i = 0) -> nvinfer1::ITensor* { + start_dims.d[2] = i * hidden_size; + auto* gate_layer = TRT_ENGINE_ADD_LAYER(engine_, + Slice, + *iter_input_hidden_add_tensor, + start_dims, + size_dims, + step_dims); + gate_layer->setInput(2, *size_dims_tensor); + auto* gate = gate_layer->getOutput(0); + gate = Act(gate, lstm_act[act_i]); + return gate; + }; + + auto* i_gate = split_gate(0); + auto* f_gate = split_gate(1); + auto* c_gate = split_gate(2, 1); + auto* o_gate = split_gate(3); + + // C_t = i_gate * c_gate + f_gate * C_{t-1} + auto* ic_gate = Prod(i_gate, c_gate); + auto* fCt1_gate = Prod(f_gate, Clayer->getOutput(0)); + auto* Ct = Sum(ic_gate, fCt1_gate); + Clayer->setInput(1, *Ct); + // H_t = tanh(C_t) * o_gate + auto* tanh_Ct = Act(Ct, lstm_act[1]); + auto* Ht = Prod(o_gate, tanh_Ct); + Hlayer->setInput(1, *Ht); + + // Ht: [K, batch, hidden_size] + nvinfer1::ILayer* layer = nullptr; + nvinfer1::ITensor* tensor = nullptr; + if (is_bidirec) { + auto* slice_forward_layer 
= + TRT_ENGINE_ADD_LAYER(engine_, + Slice, + *Ht, + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{1, 1, 1}); + auto* slice_reverse_layer = + TRT_ENGINE_ADD_LAYER(engine_, + Slice, + *Ht, + nvinfer1::Dims3{1, 0, 0}, + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{1, 1, 1}); + auto* one_tensor = Add1DConstantLayer(1); + auto* size_dims_tensor = Concat(std::vector{ + one_tensor, batch_tensor, hidden_size_tensor}); + slice_forward_layer->setInput(2, *size_dims_tensor); + slice_reverse_layer->setInput(2, *size_dims_tensor); + + auto* layer0 = loop->addLoopOutput(*slice_forward_layer->getOutput(0), + nvinfer1::LoopOutput::kCONCATENATE); + auto* layer1 = loop->addLoopOutput(*slice_reverse_layer->getOutput(0), + nvinfer1::LoopOutput::kREVERSE); + layer0->setInput(1, *seq_len_scalar); + layer1->setInput(1, *seq_len_scalar); + + std::vector concat_inputs{layer0->getOutput(0), + layer1->getOutput(0)}; + tensor = Concat(concat_inputs, 3); + } else { + layer = loop->addLoopOutput(*Ht, nvinfer1::LoopOutput::kCONCATENATE); + layer->setInput(1, *seq_len_scalar); + tensor = layer->getOutput(0); + } + finally_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *tensor); + auto* hidden_size_k_tensor = Add1DConstantLayer(hidden_size * K); + nvinfer1::ITensor* final_dims_tensor = + Concat(std::vector{ + seq_len_tensor, batch_tensor, hidden_size_k_tensor}); + finally_layer->setInput(1, *final_dims_tensor); + // update input + this_input = finally_layer->getOutput(0); + } + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(finally_layer, "rnn", {output_name}, test_mode); + // free + if (is_bidirec) { + for (size_t i = 0; i < weight_bias_vec.size(); i++) + delete[] weight_bias_vec[i]; + } +#endif + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(rnn, RnnNativeOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 9b80aeb1d4938..d65273ac01889 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -79,7 +79,7 @@ class TRTConvertValidation { TRTConvertValidation(int max_batch_size, const std::unordered_set& parameters, framework::Scope& scope, // NOLINT - int workspace_size = 1 << 10, + int64_t workspace_size = 1 << 30, bool if_add_batch = true) : parameters_(parameters), scope_(scope), diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index fcd28ec749cd8..56a8987e641a6 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -206,7 +206,7 @@ class TensorRTEngine { TensorRTEngine( int max_batch, - int max_workspace, + int64_t max_workspace, AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32, TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, @@ -672,7 +672,7 @@ class TensorRTEngine { // the runtime batch size static int runtime_batch_; // the max memory size the engine uses - int max_workspace_; + int64_t max_workspace_; AnalysisConfig::Precision precision_; TRTInt8Calibrator* calibrator_; @@ -767,7 +767,7 @@ class TRTEngineManager { TensorRTEngine* Create( std::string name, int max_batch, - int max_workspace, + int64_t max_workspace, AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32, TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc 
b/paddle/fluid/inference/tensorrt/op_teller.cc index e9298f4fc5a2e..255ddf3704aef 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -40,6 +40,10 @@ struct SimpleOpTypeSetTeller : public Teller { #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); teller_set.insert("flatten_contiguous_range"); + teller_set.insert("rnn"); + int8_teller_set.insert("rnn"); + teller_set.insert("fill_constant_batch_size_like"); + int8_teller_set.insert("fill_constant_batch_size_like"); #endif #if CUDA_VERSION >= 10020 teller_set.insert("reshape"); @@ -1250,6 +1254,57 @@ bool OpTeller::Tell(const framework::ir::Node* node, } } + if (op_type == "rnn") { + if (!with_dynamic_shape) { + return false; + } + if (desc.HasAttr("mode")) { + std::string mode = PADDLE_GET_CONST(std::string, desc.GetAttr("mode")); + if (mode != "LSTM") return false; + } + if (desc.HasAttr("dropout_prob")) { + float dropout_prob = + PADDLE_GET_CONST(float, desc.GetAttr("dropout_prob")); + if (dropout_prob > 1e-5) return false; + } + // Paddle-TRT does not support rnn with a SequenceLength input + auto rnn_inputs = desc.Inputs(); + if (rnn_inputs.find("SequenceLength") != rnn_inputs.end()) { + if (desc.Input("SequenceLength").size()) { + return false; + } + } + } + + if (op_type == "fill_constant_batch_size_like") { + if (!with_dynamic_shape) { + return false; + } + if (!desc.HasAttr("input_dim_idx")) { + return false; + } + if (!desc.HasAttr("output_dim_idx")) { + return false; + } + if (!desc.HasAttr("shape")) { + return false; + } + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " "Developers need to check whether block_desc is passed in " "the pass."; + return false; + } + auto x_var_name = desc.Input("Input")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + auto dtype = x_var_desc->GetDataType(); + // At present, only float32 input is supported in Paddle-TRT. 
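+      // proto::VarType::FP32 has the enum value 5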
+ if (dtype != 5) { + return false; + } + } + if (op_type == "slice") { if (desc.HasAttr("decrease_axis")) { std::vector decrease_axis = @@ -2089,6 +2144,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, #if !IS_TRT_VERSION_GE(7000) return false; #endif + if (!(desc.HasAttr("in_dtype") && desc.HasAttr("out_dtype"))) { + VLOG(3) << "the " << op_type + << " does not have attr (in_dtype or " + "out_dtype)"; + return false; + } int in_dtype = PADDLE_GET_CONST(int, desc.GetAttr("in_dtype")); int out_dtype = PADDLE_GET_CONST(int, desc.GetAttr("out_dtype")); if ((in_dtype == 4 || in_dtype == 5) && out_dtype == 4) { diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu index f47a65d104513..046a725b2bd1e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu @@ -23,6 +23,7 @@ #include "NvInferRuntimeCommon.h" #include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/gpu/gpu_helper.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 9602e6c87903a..fb4c2b63d6845 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -298,6 +298,24 @@ __global__ void broadcast(const T *src, } } +template +__global__ void broadcast_batch(const T *src, + T *dst, + const int seq_len, + const int head_num, + const int window_num) { + int WindownumHeadSeqlen_id = blockIdx.x % (window_num * head_num * seq_len); + int dst_offset = blockIdx.x * seq_len; + if (threadIdx.x < seq_len) { + dst[threadIdx.x + dst_offset] = src[threadIdx.x+WindownumHeadSeqlen_id*seq_len]; + } +} + +// TODO wangbojun for debug +__global__ void print_float(const float *src, int index){ + printf("%f:",src[index]); +} + int QkvToContextPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, @@ -329,6 +347,7 @@ int QkvToContextPluginDynamic::enqueue( // fit to [batch, head_num, length, length] + [batch, 1, 1, length] framework::Tensor temp_qk_bias_tensor; float *qk_bias = const_cast(static_cast(inputs[1])); + if (ProductDim(input_desc[1].dims) == (batch * seq_len)) { temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len}); auto *temp_qk_bias = temp_qk_bias_tensor.mutable_data( @@ -342,7 +361,36 @@ int QkvToContextPluginDynamic::enqueue( head_number_); qk_bias = temp_qk_bias; } + // if bias_qk is [window_num,head_number,seq_len,seq_len] + // in swin SW-MSA block dim[0] of input is batch_number*windows_number + // therefore, we broadcast bias_qk to [Batch_num*window_num, head_number, seq_len, seq_len] + int window_num=input_desc[1].dims.d[0]; + if(ProductDim(input_desc[1].dims)==window_num*head_number_*seq_len*seq_len){ + temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len}); + auto *temp_qk_bias = temp_qk_bias_tensor.mutable_data( + platform::CUDAPlace(device_id)); + int grid = batch * head_number_ * seq_len; + int block = round_up(seq_len); + broadcast_batch<<>>( + static_cast(inputs[1]), + temp_qk_bias, + seq_len, + head_number_, + window_num); + qk_bias = temp_qk_bias; + } + + printf("@@@ input_desc[0] shape: %d, %d, %d 
\r\n",input_desc[0].dims.d[0],input_desc[0].dims.d[1],input_desc[0].dims.d[2]); + printf("@@@ input_desc[1] shape: %d, %d, %d, %d \r\n",input_desc[1].dims.d[0],input_desc[1].dims.d[1],input_desc[1].dims.d[2],input_desc[1].dims.d[3]); + printf("\r\n"); + const float *input1_data = static_cast(qk_bias); + printf("@@@ in plugin biasqk 0 1 2 3: "); + print_float<<<1,1,0,stream>>>(input1_data,0); + print_float<<<1,1,0,stream>>>(input1_data,1); + print_float<<<1,1,0,stream>>>(input1_data,2); + print_float<<<1,1,0,stream>>>(input1_data,3); + printf("\r\n"); // BxSx3xNxH => tptr: 3xBxNxSxH. TransposeQKV( batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); @@ -398,6 +446,27 @@ int QkvToContextPluginDynamic::enqueue( head_number_); qk_bias = temp_qk_bias; } + // if bias_qk is [window_num,head_number,seq_len,seq_len] + // in swin SW-MSA block dim[0] of input is batch_number*windows_number + // therefore, we broadcast bias_qk to [Batch_num*window_num, head_number, seq_len, seq_len] + int window_num=input_desc[1].dims.d[0]; + if(ProductDim(input_desc[1].dims)==window_num*head_number_*seq_len*seq_len){ + temp_qk_bias_tensor.Resize({batch, head_number_, seq_len, seq_len}); + auto *temp_qk_bias = + reinterpret_cast(temp_qk_bias_tensor.mutable_data( + platform::CUDAPlace(device_id))); + int grid = batch * head_number_ * seq_len; + int block = round_up(seq_len); + broadcast_batch<<>>( + static_cast(inputs[1]), + temp_qk_bias, + seq_len, + head_number_, + window_num); + qk_bias = temp_qk_bias; + } + + const half *input1_data = static_cast(qk_bias); // BxSx3xNxH => tptr: 3xBxNxSxH. TransposeQKV( diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index cc5e44686f2b1..537b5ef1e6e03 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -126,7 +126,6 @@ void PrepareInputs(std::vector *input_slots, init_zero_tensor.lod.assign({one_batch.lod3}); lod_tensor_tensor.shape = rnn_link_data_shape; lod_tensor_tensor.lod.assign({one_batch.lod1}); - // clang-format off week_tensor.shape.assign( {static_cast(one_batch.rnn_week_datas.size()), static_cast(one_batch.rnn_week_datas.front().size())}); @@ -135,7 +134,6 @@ void PrepareInputs(std::vector *input_slots, {static_cast(one_batch.rnn_minute_datas.size()), static_cast(one_batch.rnn_minute_datas.front().size())}); minute_tensor.lod.assign({one_batch.lod3}); - // clang-format on // assign data TensorAssignData(&lod_attention_tensor, std::vector>({{0, 0}})); diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt index 38d2ae54de32d..3ad7b1b16cbc7 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(engine) proto_library(paddle_jit_property_proto SRCS property.proto) cc_library( @@ -30,11 +31,22 @@ cc_library( SRCS function_schema.cc DEPS jit_function_utils) +cc_library( + jit_function + SRCS function.cc + DEPS jit_function_utils jit_executor_engine jit_pe_engine) + cc_library( jit_layer SRCS layer.cc - DEPS jit_serializer jit_function_utils jit_serializer_utils - jit_compilation_unit jit_function_schema) + DEPS jit_serializer + jit_function_utils + jit_serializer_utils + jit_compilation_unit + jit_function_schema + jit_executor_engine + jit_pe_engine + jit_function) if(WITH_TESTING AND NOT WIN32) add_custom_target( diff --git a/paddle/fluid/jit/all.h b/paddle/fluid/jit/all.h index 5a571a72a2824..233d1dc981fb2 
100644 --- a/paddle/fluid/jit/all.h +++ b/paddle/fluid/jit/all.h @@ -14,7 +14,7 @@ #pragma once -#include "base_function.h" -#include "layer.h" -#include "serializer.h" -#include "serializer_utils.h" +#include "function.h" //NOLINT +#include "layer.h" // NOLINT +#include "serializer.h" // NOLINT +#include "serializer_utils.h" // NOLINT diff --git a/paddle/fluid/jit/ast.h b/paddle/fluid/jit/ast.h deleted file mode 100644 index 535b3a89dd60f..0000000000000 --- a/paddle/fluid/jit/ast.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/variable.h" - -namespace paddle { -namespace jit { -using Variable = paddle::framework::Variable; -class BaseFunction; -class CompilationUnit; - -class ClassType { - public: - ClassType(const std::vector& names, - std::weak_ptr cu) - : const_names_(names), compilation_unit_(cu) {} - - static std::shared_ptr Create( - const std::vector& names, - std::weak_ptr cu) { - return std::make_shared(names, cu); - } - - // const std::vector Methods() const; - - // const Variable& GetAttribute(size_t slot) const; - // const Variable& GetAttribute(const std::string& name) const; - - // size_t AddAttribute(const std::string& name, Variable val); - - private: - // TODO(dev): disingwish parameter and buffer - std::vector const_names_; - std::vector const_value_; - - std::vector methods_; - std::vector static_method_; - std::weak_ptr compilation_unit_; -}; - -} // namespace jit -} // namespace paddle diff --git a/paddle/fluid/jit/compilation_unit.cc b/paddle/fluid/jit/compilation_unit.cc index 5a434fba176d3..0f241d864fe07 100644 --- a/paddle/fluid/jit/compilation_unit.cc +++ b/paddle/fluid/jit/compilation_unit.cc @@ -16,37 +16,27 @@ #include "paddle/phi/core/enforce.h" -#include "paddle/fluid/jit/base_function.h" +#include "paddle/fluid/jit/engine/base_engine.h" namespace paddle { namespace jit { -std::shared_ptr CompilationUnit::Function( +std::shared_ptr CompilationUnit::GetEngine( const std::string &name) const { PADDLE_ENFORCE_EQ( - function_map_.count(name), + engine_map_.count(name), 1, phi::errors::InvalidArgument( - "Funciton name %s is not exist in function_map_.", name)); - return function_map_.at(name); + "Funciton named %s is not existed in engine_map_.", name)); + return engine_map_.at(name); } -void CompilationUnit::SetFunction( - const std::string &name, const std::shared_ptr &function) { - function_map_[name] = function; +void CompilationUnit::SetEngine(const std::string &name, + const std::shared_ptr &engine) { + engine_map_[name] = engine; } -std::vector CompilationUnit::FunctionNames() const { - std::vector names; - for (auto it = function_map_.begin(); it != function_map_.end(); it++) { - names.emplace_back(it->first); - } - return names; -} - -const Name2FunctionMap &CompilationUnit::FunctionMap() const { - return function_map_; -} +const jit::EngineMap 
&CompilationUnit::EngineMap() const { return engine_map_; } } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/compilation_unit.h b/paddle/fluid/jit/compilation_unit.h index 535e92fe88473..b862faa23f978 100644 --- a/paddle/fluid/jit/compilation_unit.h +++ b/paddle/fluid/jit/compilation_unit.h @@ -21,26 +21,23 @@ namespace paddle { namespace jit { -class BaseFunction; -using Name2FunctionMap = - std::unordered_map>; +class BaseEngine; +using EngineMap = std::unordered_map>; class CompilationUnit { public: CompilationUnit() = default; ~CompilationUnit() {} - std::shared_ptr Function(const std::string &name) const; + std::shared_ptr GetEngine(const std::string &name) const; - void SetFunction(const std::string &name, - const std::shared_ptr &function); + void SetEngine(const std::string &name, + const std::shared_ptr &engine); - std::vector FunctionNames() const; - - const Name2FunctionMap &FunctionMap() const; + const jit::EngineMap &EngineMap() const; private: - Name2FunctionMap function_map_; + jit::EngineMap engine_map_; }; } // namespace jit diff --git a/paddle/fluid/jit/engine/CMakeLists.txt b/paddle/fluid/jit/engine/CMakeLists.txt new file mode 100644 index 0000000000000..92a1f9582c931 --- /dev/null +++ b/paddle/fluid/jit/engine/CMakeLists.txt @@ -0,0 +1,9 @@ +cc_library( + jit_executor_engine + SRCS executor_engine.cc + DEPS executor) + +cc_library( + jit_pe_engine + SRCS pe_engine.cc + DEPS parallel_executor) diff --git a/paddle/fluid/jit/base_function.h b/paddle/fluid/jit/engine/base_engine.h similarity index 95% rename from paddle/fluid/jit/base_function.h rename to paddle/fluid/jit/engine/base_engine.h index 50dadaf4ae227..eaf3c1221c8a2 100644 --- a/paddle/fluid/jit/base_function.h +++ b/paddle/fluid/jit/engine/base_engine.h @@ -22,14 +22,14 @@ namespace jit { using Tensor = paddle::experimental::Tensor; using DenseTensor = phi::DenseTensor; -class BaseFunction { +class BaseEngine { public: virtual std::vector operator()( const std::vector &inputs) = 0; virtual std::vector operator()(const std::vector &inputs) = 0; - virtual ~BaseFunction() {} + virtual ~BaseEngine() {} }; } // namespace jit diff --git a/paddle/fluid/jit/engine/executor_engine.cc b/paddle/fluid/jit/engine/executor_engine.cc new file mode 100644 index 0000000000000..58d80426e5fba --- /dev/null +++ b/paddle/fluid/jit/engine/executor_engine.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
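+// ExecutorEngine runs the ProgramDesc described by a FunctionInfo with the
+// plain framework::Executor: parameters are shared into a private scope once
+// at construction, inputs are shared into that scope on every call, and the
+// named outputs are fetched back as DenseTensors.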
+ +#include "paddle/fluid/jit/engine/executor_engine.h" + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace jit { + +ExecutorEngine::ExecutorEngine(const std::shared_ptr &info, + const VariableMap ¶ms_dict, + const phi::Place &place) + : info_(info), place_(place), inner_exe_(place_) { + info_->RemoveDescFeedFetch(); + PADDLE_ENFORCE_GT( + static_cast(info_->ProgramDesc().Block(0).OpSize()), + 0, + platform::errors::PreconditionNotMet( + "There is no operator in ProgramDesc.")); + utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_); + VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); +} + +std::vector ExecutorEngine::operator()( + const std::vector &inputs) { + auto dense_tensors = utils::ToDenseTensors(inputs); + return utils::ToTensors(this->operator()(dense_tensors)); +} + +std::vector ExecutorEngine::operator()( + const std::vector &inputs) { + utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); + inner_exe_.Run(info_->ProgramDesc(), + &scope_, + /*blockID=*/0, + false, + true, + info_->OutputArgNames()); + std::vector outputs; + utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs); + return outputs; +} + +const std::shared_ptr &ExecutorEngine::Info() const { + return info_; +} + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/engine/executor_engine.h b/paddle/fluid/jit/engine/executor_engine.h new file mode 100644 index 0000000000000..a39cf85020c1b --- /dev/null +++ b/paddle/fluid/jit/engine/executor_engine.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/scope.h" + +#include "paddle/fluid/jit/engine/base_engine.h" +#include "paddle/fluid/jit/function_schema.h" +#include "paddle/fluid/jit/function_utils.h" + +namespace paddle { +namespace jit { + +class ExecutorEngine : public BaseEngine { + public: + ExecutorEngine(const std::shared_ptr &info, + const VariableMap ¶ms_dict, + const phi::Place &place); + + ~ExecutorEngine() noexcept {} + + std::vector operator()(const std::vector &inputs); + + std::vector operator()(const std::vector &inputs); + + const std::shared_ptr &Info() const; + + private: + std::shared_ptr info_; + framework::Scope scope_; + phi::Place place_; + framework::Executor inner_exe_; +}; + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/engine/pe_engine.cc b/paddle/fluid/jit/engine/pe_engine.cc new file mode 100644 index 0000000000000..ddc2de0fc530e --- /dev/null +++ b/paddle/fluid/jit/engine/pe_engine.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/jit/engine/pe_engine.h" + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace jit { + +static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) { + ExecutionStrategy execution_strategy; + + auto device_type = platform::Place2DeviceType(place); + switch (device_type) { + case platform::DeviceType::CPU: { + execution_strategy.num_threads_ = 2; + break; + } + case platform::DeviceType::CUDA: { + // NOTE: According experiments, one thread is faster in + // most model training. + execution_strategy.num_threads_ = 1; + break; + } + case platform::DeviceType::XPU: { + execution_strategy.num_threads_ = 1; + break; + } + case platform::DeviceType::IPU: { + execution_strategy.num_threads_ = 1; + break; + } + default: + PADDLE_THROW(platform::errors::Unavailable("Unsupported Device type %d.", + device_type)); + } + execution_strategy.use_device_ = device_type; + + return execution_strategy; +} + +PEEngine::PEEngine(const std::shared_ptr &info, + const VariableMap ¶ms_dict, + const phi::Place &place) + : info_(info), place_(place) { + info_->RemoveDescFeedFetch(); + PADDLE_ENFORCE_GT( + static_cast(info_->ProgramDesc().Block(0).OpSize()), + 0, + platform::errors::PreconditionNotMet( + "There is no operator in ProgramDesc.")); + utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_); + VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); + CreateGraphAndPE(); +} + +void PEEngine::CreateGraphAndPE() { + framework::details::BuildStrategy build_strategy; + auto execution_strategy = GetExecutionStrategy(place_); + + auto &program_desc = info_->ProgramDesc(); + const framework::BlockDesc &global_block = program_desc.Block(0); + int64_t start_op_index = 0; + int64_t end_op_index = static_cast(global_block.OpSize()); + + graph_ = std::make_shared(program_desc, start_op_index, end_op_index); + inner_pe_ = std::make_shared( + place_, &scope_, execution_strategy, build_strategy, graph_.get()); + inner_pe_->PrepareVariables(&scope_); + inner_pe_->SkipMemoryReuse(/*scope_idx=*/0, info_->InputArgNames()); +} + +std::vector PEEngine::operator()(const std::vector &inputs) { + auto dense_tensors = utils::ToDenseTensors(inputs); + return utils::ToTensors(this->operator()(dense_tensors)); +} + +std::vector PEEngine::operator()( + const std::vector &inputs) { + utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); + + // update op_handle scope_map in pe->executor_->Graph + std::unordered_map scope_map = { + {inner_pe_->GetLocalScopes().front(), &scope_}}; + inner_pe_->ResetOpHandleScopeMapOfGraphs(scope_map); + // need to recreate tmp variables in new scope + inner_pe_->PrepareVariables(&scope_); + + inner_pe_->RunWithoutFetch(info_->OutputArgNames()); + + std::vector outputs; + 
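+  // fetch the named outputs from the local scope, then drop the temporary
+  // child scopes created during this run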
utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs); + scope_.DropKids(); + return outputs; +} + +const std::shared_ptr &PEEngine::Info() const { return info_; } + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/engine/pe_engine.h b/paddle/fluid/jit/engine/pe_engine.h new file mode 100644 index 0000000000000..16ade6d77d8ac --- /dev/null +++ b/paddle/fluid/jit/engine/pe_engine.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/scope.h" + +#include "paddle/fluid/jit/engine/base_engine.h" +#include "paddle/fluid/jit/function_schema.h" +#include "paddle/fluid/jit/function_utils.h" + +namespace paddle { + +namespace framework { +class ParallelExecutor; +namespace details { +class ExecutionStrategy; +} +namespace ir { +class Graph; +} +} // namespace framework + +namespace jit { +using ExecutionStrategy = framework::details::ExecutionStrategy; +using ParallelExecutor = framework::ParallelExecutor; +using Graph = framework::ir::Graph; + +class PEEngine : public BaseEngine { + public: + PEEngine(const std::shared_ptr &info, + const VariableMap ¶ms_dict, + const phi::Place &place); + + ~PEEngine() noexcept {} + + void CreateGraphAndPE(); + + std::vector operator()(const std::vector &inputs); + + std::vector operator()(const std::vector &inputs); + + const std::shared_ptr &Info() const; + + private: + std::shared_ptr info_; + framework::Scope scope_; + phi::Place place_; + std::shared_ptr inner_pe_; + std::shared_ptr graph_; +}; + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/executor_function.h b/paddle/fluid/jit/executor_function.h deleted file mode 100644 index 87a31a91949f3..0000000000000 --- a/paddle/fluid/jit/executor_function.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/phi/core/enforce.h" - -#include "paddle/fluid/jit/base_function.h" -#include "paddle/fluid/jit/function_schema.h" -#include "paddle/fluid/jit/function_utils.h" - -namespace paddle { -namespace jit { - -class ExecutorFunction : public BaseFunction { - public: - ExecutorFunction(const std::shared_ptr &info, - const Name2VariableMap ¶ms_dict, - const phi::Place &place) - : info_(info), place_(place), inner_exe_(place_) { - info_->RemoveDescFeedFetch(); - PADDLE_ENFORCE_GT( - static_cast(info_->ProgramDesc().Block(0).OpSize()), - 0, - platform::errors::PreconditionNotMet( - "There is no operator in ProgramDesc.")); - utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_); - VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); - } - - ~ExecutorFunction() noexcept {} - - std::vector operator()(const std::vector &inputs) { - auto dense_tensors = utils::ToDenseTensors(inputs); - return utils::ToTensors(this->operator()(dense_tensors)); - } - - std::vector operator()(const std::vector &inputs) { - utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); - inner_exe_.Run(info_->ProgramDesc(), - &scope_, - /*blockID=*/0, - false, - true, - info_->OutputArgNames()); - std::vector outputs; - utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs); - return outputs; - } - - const std::shared_ptr &Info() const { return info_; } - - private: - std::shared_ptr info_; - framework::Scope scope_; - phi::Place place_; - framework::Executor inner_exe_; -}; - -} // namespace jit -} // namespace paddle diff --git a/paddle/fluid/jit/function.cc b/paddle/fluid/jit/function.cc new file mode 100644 index 0000000000000..0d297da500a49 --- /dev/null +++ b/paddle/fluid/jit/function.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
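+// Function is a lightweight, copyable handle over a BaseEngine; calling it
+// forwards the inputs to the engine and converts between paddle Tensor and
+// phi DenseTensor as needed, e.g.
+//   auto layer = paddle::jit::Load(model_path, place);
+//   auto outs = layer.Function("forward")(inputs);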
+ +#include "paddle/fluid/jit/function.h" + +#include +#include + +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/fluid/jit/engine/base_engine.h" +#include "paddle/fluid/jit/function_utils.h" + +namespace paddle { +namespace jit { + +Function::Function(BaseEngine* engine) : engine_(engine) {} + +std::vector Function::operator()( + const std::vector& inputs) const { + auto dense_tensors = utils::ToDenseTensors(inputs); + return utils::ToTensors(this->operator()(dense_tensors)); +} + +std::vector Function::operator()( + const std::vector& inputs) const { + return (*engine_)(inputs); +} + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h b/paddle/fluid/jit/function.h similarity index 54% rename from paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h rename to paddle/fluid/jit/function.h index 60b7e98145698..daaecd55bfe67 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h +++ b/paddle/fluid/jit/function.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,21 +15,30 @@ #pragma once #include +#include -#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { -namespace framework { -namespace ir { -class MatmulV2TransposeReshapeMKLDNNPass - : public MatmulTransposeReshapeMKLDNNPass { +namespace jit { +class BaseEngine; +using DenseTensor = phi::DenseTensor; +using Tensor = paddle::experimental::Tensor; + +class Function { public: - MatmulV2TransposeReshapeMKLDNNPass(); - virtual ~MatmulV2TransposeReshapeMKLDNNPass() {} + explicit Function(BaseEngine* engine); + + std::vector operator()(const std::vector& inputs) const; + + std::vector operator()( + const std::vector& inputs) const; - protected: - const std::string name_scope_{"matmul_v2_transpose_reshape_fuse"}; + ~Function() = default; + + private: + BaseEngine* engine_; }; -} // namespace ir -} // namespace framework + +} // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/function_utils.cc b/paddle/fluid/jit/function_utils.cc index 83da12d2652a3..b67b5ba5b0518 100644 --- a/paddle/fluid/jit/function_utils.cc +++ b/paddle/fluid/jit/function_utils.cc @@ -71,14 +71,19 @@ void ShareIntoScope(const std::vector &ordered_input_names, } void ShareParamsIntoScope(const std::vector ¶m_names, - const Name2VariableMap ¶ms_dict, + const VariableMap ¶ms_dict, framework::Scope *scope) { - VLOG(3) << "param_names size: " << param_names.size(); for (size_t i = 0; i < param_names.size(); ++i) { std::string name = param_names[i]; + PADDLE_ENFORCE_EQ(params_dict.count(name), + 1, + phi::errors::InvalidArgument( + "Parameter named %s is not existed in params_dict. 
" + "Please check that your model was saved correctly", + name)); + auto ¶m = params_dict.find(name)->second; auto &dense_tensor = param->Get(); - VLOG(3) << "share into scope: " << name; auto *var = scope->Var(name); auto *dst_tensor = var->GetMutable(); *dst_tensor = dense_tensor; diff --git a/paddle/fluid/jit/function_utils.h b/paddle/fluid/jit/function_utils.h index 90e2e4b7f798f..d61b720cec88f 100644 --- a/paddle/fluid/jit/function_utils.h +++ b/paddle/fluid/jit/function_utils.h @@ -33,8 +33,7 @@ class Scope; namespace jit { using Variable = paddle::framework::Variable; -using Name2VariableMap = - std::unordered_map>; +using VariableMap = std::unordered_map>; using DenseTensor = phi::DenseTensor; using Tensor = paddle::experimental::Tensor; @@ -52,15 +51,15 @@ void ShareIntoScope(const std::vector &ordered_input_names, framework::Scope *scope); void ShareParamsIntoScope(const std::vector ¶m_names, - const Name2VariableMap ¶ms_dict, + const VariableMap ¶ms_dict, framework::Scope *scope); void RemoveFeedFetch(framework::ProgramDesc *program_desc); template -std::shared_ptr MakeFunction(const std::shared_ptr &info, - const Name2VariableMap ¶ms_dict, - const phi::Place &place) { +std::shared_ptr MakeEngine(const std::shared_ptr &info, + const VariableMap ¶ms_dict, + const phi::Place &place) { return std::make_shared(info, params_dict, place); } diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc index a80b05e45cfbd..9055120e4bbb7 100644 --- a/paddle/fluid/jit/layer.cc +++ b/paddle/fluid/jit/layer.cc @@ -15,62 +15,74 @@ #include "paddle/fluid/jit/layer.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" -#include "paddle/fluid/jit/base_function.h" #include "paddle/fluid/jit/compilation_unit.h" +#include "paddle/fluid/jit/engine/base_engine.h" +#include "paddle/fluid/jit/function.h" #include "paddle/fluid/jit/function_schema.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/errors.h" namespace paddle { namespace jit { -Layer::Layer(const Name2VariableMap& params_dict, - const Name2VariableMap& attrs_dict, +Layer::Layer(const VariableMap& params_map, + const VariableMap& attrs_map, + const FunctionInfoMap& info_map, const phi::Place& place) - : params_dict_(params_dict), attrs_dict_(attrs_dict) { + : params_map_(params_map), attrs_map_(attrs_map), info_map_(info_map) { unit_.reset(new CompilationUnit()); } -std::shared_ptr Layer::Function(const std::string& name) const { - return unit_->Function(name); +jit::Function Layer::Function(const std::string& name) const { + return jit::Function(unit_->GetEngine(name).get()); } std::vector Layer::forward(const std::vector& inputs) { - auto func = Function("forward"); - return (*func)(inputs); + auto func = this->Function("forward"); + return func(inputs); } std::vector Layer::forward( const std::vector& inputs) { - auto func = Function("forward"); - return (*func)(inputs); + auto func = this->Function("forward"); + return func(inputs); } void Layer::to(const phi::Place& place) {} -void Layer::SetFunction(const std::string& name, - const std::shared_ptr& function) { - unit_->SetFunction(name, function); +void Layer::SetEngine(const std::string& name, + const std::shared_ptr& engine) { + unit_->SetEngine(name, engine); } -std::vector Layer::FunctionNames() const { - return unit_->FunctionNames(); +const std::shared_ptr& Layer::FunctionInfo( + const std::string& name) const { + PADDLE_ENFORCE_EQ( + info_map_.count(name), + 1, + 
phi::errors::InvalidArgument( + "FuncitonInfo named %s is not existed in info_map_.", name)); + return info_map_.at(name); } -const Name2FunctionMap& Layer::FunctionMap() const { - return unit_->FunctionMap(); +std::vector Layer::FunctionNames() const { + std::vector names; + for (auto it = info_map_.begin(); it != info_map_.end(); ++it) { + names.emplace_back(it->first); + } + return names; } #define PD_SPECIALZE_ATTRIBUTE_TYPE(T) \ template <> \ T Layer::Attribute(const std::string& name) const { \ - if (attrs_dict_.find(name) == attrs_dict_.end()) { \ + if (attrs_map_.find(name) == attrs_map_.end()) { \ PADDLE_THROW(phi::errors::NotFound( \ "Attribute can not found %s, please check if it exists.")); \ return T(); \ } \ - auto var = attrs_dict_.at(name); \ + auto var = attrs_map_.at(name); \ T ret = var->Get(); \ return ret; \ } diff --git a/paddle/fluid/jit/layer.h b/paddle/fluid/jit/layer.h index 4c6c714d37b45..dd5ff5d9f91cd 100644 --- a/paddle/fluid/jit/layer.h +++ b/paddle/fluid/jit/layer.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -21,7 +22,7 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/place.h" -#include "base_function.h" //NOLINT +#include "function.h" //NOLINT namespace paddle { @@ -31,22 +32,23 @@ class Variable; namespace jit { class CompilationUnit; +class FunctionInfo; using DenseTensor = phi::DenseTensor; using Tensor = paddle::experimental::Tensor; using Variable = paddle::framework::Variable; -using Name2VariableMap = - std::unordered_map>; -using Name2FunctionMap = - std::unordered_map>; +using VariableMap = std::unordered_map>; +using FunctionInfoMap = + std::unordered_map>; class Layer { public: - Layer(const Name2VariableMap& params_dict, - const Name2VariableMap& attrs_dict_, + Layer(const VariableMap& params_map, + const VariableMap& attrs_map_, + const FunctionInfoMap& info_map, const phi::Place& place); - std::shared_ptr Function(const std::string& name) const; + jit::Function Function(const std::string& name) const; template T Attribute(const std::string& name) const; @@ -57,16 +59,18 @@ class Layer { void to(const phi::Place& place); - void SetFunction(const std::string& name, - const std::shared_ptr& function); + void SetEngine(const std::string& name, + const std::shared_ptr& engine); - std::vector FunctionNames() const; + const std::shared_ptr& FunctionInfo( + const std::string& name) const; - const Name2FunctionMap& FunctionMap() const; + std::vector FunctionNames() const; private: - Name2VariableMap params_dict_; - Name2VariableMap attrs_dict_; + VariableMap params_map_; + VariableMap attrs_map_; + FunctionInfoMap info_map_; std::shared_ptr unit_; }; diff --git a/paddle/fluid/jit/layer_test.cc b/paddle/fluid/jit/layer_test.cc index 1579610c7a63b..b54ea3c4aa132 100644 --- a/paddle/fluid/jit/layer_test.cc +++ b/paddle/fluid/jit/layer_test.cc @@ -26,6 +26,7 @@ #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/fluid/jit/function.h" #include "paddle/fluid/jit/function_utils.h" #include "paddle/fluid/jit/layer.h" #include "paddle/fluid/jit/serializer.h" @@ -102,7 +103,7 @@ TEST(CpuLayerTest, Construct) { EXPECT_NEAR(out_data[0], 0.02194316, 1e-6); auto func = layer.Function("infer"); - outs = (*func)(inputs); + outs = func(inputs); out_data = outs[0].data(); EXPECT_NEAR(out_data[0], 1.41562390, 1e-6); auto pow_out = @@ -127,7 +128,7 @@ TEST(GpuLayerTest, Construct) { EXPECT_NEAR(out_data[0], 0.02194316, 1e-6); auto func = layer.Function("infer"); 
- outs = (*func)(inputs); + outs = func(inputs); gpu_tensor = outs[0]; cpu_tensor = paddle::experimental::copy_to(gpu_tensor, phi::CPUPlace(), true); out_data = cpu_tensor.data(); diff --git a/paddle/fluid/jit/object.h b/paddle/fluid/jit/object.h deleted file mode 100644 index 94aae67376007..0000000000000 --- a/paddle/fluid/jit/object.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/variable.h" - -namespace paddle { -namespace jit { -class ClassType; - -namespace internal { - -class Object { - public: - Object(const std::shared_ptr& type, size_t num_slot) - : type_(type) { - slots_.resize(num_slot); - } - - static std::unique_ptr Create(std::shared_ptr type, - size_t num_slot) { - return std::make_unique(type, num_slot); - } - - std::shared_ptr Type() const { return type_; } - - void SetSlot(size_t slot, Variable val) { - if (slot >= slots_.size()) { - slots_.resize(slot); - } - slots_[slot] = std::move(val); - } - - const Variable& GetSlot(size_t slot) { - // TODO(dev): Add ENFORCE_LT(slot, size()); - return slots_[slot]; - } - - Variable GetAttr(const std::string& name) const; - - void SetAttr(const std::string& name, Variable val); - - private: - std::shared_ptr type_; - // Store Tensors and Attributes - std::vector slots_; -}; - -} // namespace internal -} // namespace jit -} // namespace paddle diff --git a/paddle/fluid/jit/pe_function.h b/paddle/fluid/jit/pe_function.h deleted file mode 100644 index 809ad5ecbe662..0000000000000 --- a/paddle/fluid/jit/pe_function.h +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/executor_cache.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/phi/core/enforce.h" - -#include "paddle/fluid/jit/base_function.h" -#include "paddle/fluid/jit/function_schema.h" -#include "paddle/fluid/jit/function_utils.h" - -namespace paddle { -namespace jit { - -using ExecutionStrategy = framework::details::ExecutionStrategy; -using ParallelExecutor = framework::ParallelExecutor; -using Graph = framework::ir::Graph; - -class PEFunction : public BaseFunction { - public: - PEFunction(const std::shared_ptr &info, - const Name2VariableMap ¶ms_dict, - const phi::Place &place) - : info_(info), place_(place) { - info_->RemoveDescFeedFetch(); - PADDLE_ENFORCE_GT( - static_cast(info_->ProgramDesc().Block(0).OpSize()), - 0, - platform::errors::PreconditionNotMet( - "There is no operator in ProgramDesc.")); - utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_); - VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); - CreateGraphAndPE(); - } - - ~PEFunction() noexcept {} - - static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) { - ExecutionStrategy execution_strategy; - - auto device_type = platform::Place2DeviceType(place); - switch (device_type) { - case platform::DeviceType::CPU: { - execution_strategy.num_threads_ = 2; - break; - } - case platform::DeviceType::CUDA: { - // NOTE: According experiments, one thread is faster in - // most model training. 
- execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::XPU: { - execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::IPU: { - execution_strategy.num_threads_ = 1; - break; - } - default: - PADDLE_THROW(platform::errors::Unavailable( - "Unsupported Device type %d.", device_type)); - } - execution_strategy.use_device_ = device_type; - - return execution_strategy; - } - - void CreateGraphAndPE() { - framework::details::BuildStrategy build_strategy; - auto execution_strategy = GetExecutionStrategy(place_); - - auto &program_desc = info_->ProgramDesc(); - const framework::BlockDesc &global_block = program_desc.Block(0); - int64_t start_op_index = 0; - int64_t end_op_index = static_cast(global_block.OpSize()); - - graph_ = - std::make_shared(program_desc, start_op_index, end_op_index); - inner_pe_ = std::make_shared( - place_, &scope_, execution_strategy, build_strategy, graph_.get()); - inner_pe_->PrepareVariables(&scope_); - inner_pe_->SkipMemoryReuse(/*scope_idx=*/0, info_->InputArgNames()); - } - - std::vector operator()(const std::vector &inputs) { - auto dense_tensors = utils::ToDenseTensors(inputs); - return utils::ToTensors(this->operator()(dense_tensors)); - } - - std::vector operator()(const std::vector &inputs) { - utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); - - // update op_handle scope_map in pe->executor_->Graph - std::unordered_map scope_map = { - {inner_pe_->GetLocalScopes().front(), &scope_}}; - inner_pe_->ResetOpHandleScopeMapOfGraphs(scope_map); - // need to recreate tmp variables in new scope - inner_pe_->PrepareVariables(&scope_); - - inner_pe_->RunWithoutFetch(info_->OutputArgNames()); - - std::vector outputs; - utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs); - scope_.DropKids(); - return outputs; - } - - const std::shared_ptr &Info() const { return info_; } - - private: - std::shared_ptr info_; - framework::Scope scope_; - phi::Place place_; - std::shared_ptr inner_pe_; - std::shared_ptr graph_; -}; - -} // namespace jit -} // namespace paddle diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc index a9bd5676ad261..65a39bc7f9a56 100644 --- a/paddle/fluid/jit/serializer.cc +++ b/paddle/fluid/jit/serializer.cc @@ -20,9 +20,9 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/jit/executor_function.h" +#include "paddle/fluid/jit/engine/executor_engine.h" +#include "paddle/fluid/jit/engine/pe_engine.h" #include "paddle/fluid/jit/layer.h" -#include "paddle/fluid/jit/pe_function.h" #include "paddle/fluid/jit/property.h" #include "paddle/fluid/jit/serializer_utils.h" @@ -30,18 +30,18 @@ DECLARE_string(jit_engine_type); namespace paddle { namespace jit { - +using FunctionInfoMap = + std::unordered_map>; Layer Deserializer::operator()(const std::string& path, const phi::Place& place) { const auto& pdmodel_paths = utils::PdmodelFilePaths(path); // set is ordered std::set param_names_set; - std::vector> infos; + FunctionInfoMap info_map; for (auto& it : pdmodel_paths) { auto& func_name = it.first; auto program_desc = LoadProgram(it.second); - // TODO(dev): load int/float attrs std::vector persist_var_names; auto all_var_desc = program_desc.Block(0).AllVars(); for (auto* desc_ptr : all_var_desc) { @@ -51,12 +51,12 @@ Layer Deserializer::operator()(const std::string& path, } param_names_set.insert(persist_var_names.begin(), persist_var_names.end()); - infos.emplace_back(std::make_shared( - func_name, 
persist_var_names, program_desc)); + info_map[func_name] = std::make_shared( + func_name, persist_var_names, program_desc); } - Name2VariableMap params_dict; - Name2VariableMap attrs_dict; + VariableMap params_dict; + VariableMap attrs_dict; ReadTensorData(path + PDPARAMS_SUFFIX, param_names_set, place, ¶ms_dict); if (utils::FileExists(path + PROPERTY_SUFFIX)) { @@ -64,23 +64,23 @@ Layer Deserializer::operator()(const std::string& path, VLOG(3) << "Read Property Success!"; } - Layer layer = Layer(params_dict, attrs_dict, place); + Layer layer = Layer(params_dict, attrs_dict, info_map, place); - for (auto& info : infos) { + for (auto it = info_map.begin(); it != info_map.end(); ++it) { + const std::string& func_name = it->first; + auto& info = it->second; if (FLAGS_jit_engine_type == "Executor") { - VLOG(3) << "Add function type: ExecutorFunction. name: " - << info->FunctionName(); - layer.SetFunction( - info->FunctionName(), - utils::MakeFunction(info, params_dict, place)); + VLOG(3) << "Add function type: ExecutorEngine. Function name: " + << func_name; + layer.SetEngine( + func_name, + utils::MakeEngine(info, params_dict, place)); } else if (FLAGS_jit_engine_type == "PE") { - VLOG(3) << "Add function type: PEFunction. name: " - << info->FunctionName(); - layer.SetFunction( - info->FunctionName(), - utils::MakeFunction(info, params_dict, place)); + VLOG(3) << "Add function type: PEEngine. Function name: " << func_name; + layer.SetEngine(func_name, + utils::MakeEngine(info, params_dict, place)); } else { - PD_THROW("Invalid JitLayer funciton type."); + PD_THROW("Invalid JitLayer engine type."); } } @@ -90,7 +90,7 @@ Layer Deserializer::operator()(const std::string& path, void Deserializer::ReadTensorData(const std::string& file_name, const std::set& var_name, const phi::Place& place, - Name2VariableMap* params_dict) const { + VariableMap* params_dict) const { VLOG(3) << "ReadTensorData from: " << file_name; std::ifstream fin(file_name, std::ios::binary); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -106,11 +106,11 @@ void Deserializer::ReadTensorData(const std::string& file_name, } void Deserializer::ReadAttributeData(const std::string& file_path, - Name2VariableMap* attrs_dict) const { + VariableMap* attrs_dict) const { VLOG(3) << "ReadPropertyData from: " << file_path; Property p; p.Deserialization(file_path); - *attrs_dict = static_cast(p.Values()); + *attrs_dict = static_cast(p.Values()); return; } diff --git a/paddle/fluid/jit/serializer.h b/paddle/fluid/jit/serializer.h index 188239f469a57..b93eaa44fe632 100644 --- a/paddle/fluid/jit/serializer.h +++ b/paddle/fluid/jit/serializer.h @@ -31,8 +31,7 @@ class ProgramDesc; namespace jit { class Layer; using Variable = paddle::framework::Variable; -using Name2VariableMap = - std::unordered_map>; +using VariableMap = std::unordered_map>; // Export Layer into local disk class Serializer { @@ -56,11 +55,11 @@ class Deserializer { void ReadTensorData(const std::string& file_name, const std::set& var_name, const phi::Place& place, - Name2VariableMap* params_dict) const; + VariableMap* params_dict) const; // property pb void ReadAttributeData(const std::string& file_path, - Name2VariableMap* attrs_dict) const; + VariableMap* attrs_dict) const; // void ReadExtraInfo(const std::string& file_name) const; diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 893f7d51140a7..a25c54bae2a43 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ 
b/paddle/fluid/operators/CMakeLists.txt @@ -104,7 +104,7 @@ endif() set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta) register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op - recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) + recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) target_link_libraries(run_program_op cuda_graph_with_memory_pool) @@ -129,22 +129,6 @@ else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() -if (WITH_GPU OR WITH_ROCM) - if (MKL_FOUND AND WITH_ONEMKL) - op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda dynload_mklrt ${OP_HEADER_DEPS}) - target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE}) - else() - op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS}) - endif() -else() - if (MKL_FOUND AND WITH_ONEMKL) - op_library(spectral_op SRCS spectral_op.cc DEPS dynload_mklrt ${OP_HEADER_DEPS}) - target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE}) - else() - op_library(spectral_op SRCS spectral_op.cc DEPS ${OP_HEADER_DEPS}) - endif() -endif() - if (WITH_ASCEND_CL) op_library(sync_batch_norm_op) endif() diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index 72e0e9ceacf48..6cfe4738d777b 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -370,7 +370,7 @@ class HardSigmoidGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); + auto* x = ctx.Input("X"); auto* dx = ctx.Output(framework::GradVarName("X")); float slope = ctx.Attr("slope"); float offset = ctx.Attr("offset"); @@ -381,7 +381,7 @@ class HardSigmoidGradMLUKernel : public framework::OpKernel { 1.0f /*sliced_dim useless*/, slope, offset); - MLUCnnlTensorDesc out_desc(*out); + MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc dout_desc(*dout); MLUCnnlTensorDesc dx_desc(*dx); MLUCnnl::ActiveGrad(ctx, @@ -392,8 +392,8 @@ class HardSigmoidGradMLUKernel : public framework::OpKernel { nullptr, dout_desc.get(), GetBasePtr(dout), - out_desc.get(), - GetBasePtr(out), + x_desc.get(), + GetBasePtr(x), dx_desc.get(), GetBasePtr(dx)); } diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 199a9b95ec3cb..1aa445bda3717 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -273,7 +273,7 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); MLUCnnl::FusedBatchNormGrad(ctx, - true /*is_training*/, + false /*is_training*/, transformed_desc.get(), GetBasePtr(&transformed_d_y), transformed_desc.get(), diff --git a/paddle/fluid/operators/class_center_sample_op.cc b/paddle/fluid/operators/class_center_sample_op.cc index d870ad28a0368..57f8bfb71fb1f 100644 --- a/paddle/fluid/operators/class_center_sample_op.cc +++ b/paddle/fluid/operators/class_center_sample_op.cc @@ -12,7 +12,11 @@ // See 
the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/class_center_sample_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,30 +24,6 @@ namespace operators { class ClassCenterSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Label"), "Input", "Label", "ClassCenterSample"); - OP_INOUT_CHECK(ctx->HasOutput("RemappedLabel"), - "Output", - "RemappedLabel", - "ClassCenterSample"); - OP_INOUT_CHECK(ctx->HasOutput("SampledLocalClassCenter"), - "Output", - "SampledLocalClassCenter", - "ClassCenterSample"); - - auto x_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_EQ(x_dims.size(), - 1, - platform::errors::InvalidArgument( - "Rank of Input(Label) should be equal to 1, " - "but the value given is %d.", - x_dims.size())); - - ctx->SetOutputDim("RemappedLabel", x_dims); - auto num_samples = ctx->Attrs().Get("num_samples"); - ctx->SetOutputDim("SampledLocalClassCenter", phi::make_ddim({num_samples})); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -143,10 +123,10 @@ class ClassCenterSampleOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(class_center_sample, + ClassCenterSampleInferShapeFunctor, + PD_INFER_META(phi::ClassCenterSampleInferMeta)); REGISTER_OP_WITHOUT_GRADIENT(class_center_sample, ops::ClassCenterSampleOp, - ops::ClassCenterSampleOpMaker); -REGISTER_OP_CPU_KERNEL(class_center_sample, - ops::ClassCenterSampleCPUKernel, - ops::ClassCenterSampleCPUKernel); + ops::ClassCenterSampleOpMaker, + ClassCenterSampleInferShapeFunctor); diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu deleted file mode 100644 index b92062b1aee24..0000000000000 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ /dev/null @@ -1,611 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifdef PADDLE_WITH_HIP -#include -#include - -#include -typedef hiprandState curandState; -namespace cub = hipcub; -#else -#include -#include - -#include -#endif - -#include -#include - -#include "paddle/fluid/operators/class_center_sample_op.h" -#include "paddle/phi/api/include/tensor.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroup.h" -#include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#endif - -namespace paddle { -namespace operators { -#define CUDA_KERNEL_LOOP(i, n) \ - for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x, \ - step = blockDim.x * gridDim.x; \ - i < (n); \ - i += step) - -using Tensor = framework::Tensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; - -inline int32_t NumBlocks(const int32_t n) { - return std::min((n + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void RandomSampleClassCenter(const int64_t n, - int64_t seed, - int64_t increment, - const int64_t max_val, - T* buffer) { - const int id = blockIdx.x * blockDim.x + threadIdx.x; - curandState localState; - size_t local_seed = - (static_cast(seed) + 0x9E3779B9U + - (static_cast(id) << 6U) + (static_cast(id) >> 2U)); -#ifdef PADDLE_WITH_HIP - hiprand_init(local_seed, id, increment, &localState); - CUDA_KERNEL_LOOP(i, n) { - buffer[i] = static_cast(hiprand(&localState) % max_val); - } -#else - curand_init(local_seed, id, increment, &localState); - CUDA_KERNEL_LOOP(i, n) { - buffer[i] = static_cast(curand(&localState) % max_val); - } -#endif -} - -template -__global__ void Range(const int64_t n, T* out) { - CUDA_KERNEL_LOOP(i, n) { out[i] = static_cast(i); } -} - -template -__global__ void MarkPositiveClassCenter(const int64_t n, - const int64_t rank, - const T* class_interval_ptr, - const int num_classes, - const T* labels, - T* out) { - CUDA_KERNEL_LOOP(i, n) { - T label = labels[i] - class_interval_ptr[rank]; - if (label >= 0 && label < num_classes) { - out[label] = label - num_classes; - } - } -} - -template -__device__ void FindIntervalIndex(const T* class_interval_ptr, - const int64_t nranks, - const T value, - int64_t* find_index) { - int64_t start = 0; - int64_t end = nranks; - int64_t mid = ((end - start) >> 1) + start + 1; - while (start < end) { - if (class_interval_ptr[mid] == value) break; - if (class_interval_ptr[mid] > value) - end = mid - 1; - else - start = mid; - mid = ((end - start) >> 1) + start + 1; - } - *find_index = min(mid, end); -} - -template -__global__ void GetClassCenterBound(const int64_t n, - const int64_t nranks, - const T* class_interval_ptr, - const T* key_ptr, - const T* value_ptr, - T* bound_index, - T* bound_value) { - CUDA_KERNEL_LOOP(i, n) { - if (i != 0) { - int64_t cur_index, pre_index; - FindIntervalIndex(class_interval_ptr, nranks, key_ptr[i], &cur_index); - FindIntervalIndex(class_interval_ptr, nranks, key_ptr[i - 1], &pre_index); - if (cur_index > pre_index) { - assert(cur_index < nranks); -#pragma unroll - for (int32_t j = pre_index + 1; j <= cur_index; ++j) { - bound_index[j] = static_cast(i); - bound_value[j] = value_ptr[i]; - } - } - } - } - CUDA_KERNEL_LOOP(i, nranks + 1) { - int64_t first_index, last_index; - FindIntervalIndex(class_interval_ptr, nranks, key_ptr[0], &first_index); - FindIntervalIndex(class_interval_ptr, nranks, key_ptr[n - 1], &last_index); - if (i <= first_index) { - bound_index[i] = 0; - bound_value[i] = 
value_ptr[0]; - } else if (i > last_index) { - bound_index[i] = n; - bound_value[i] = value_ptr[n - 1] + 1; - } - } -} - -template -__global__ void GetRemappedLabel(const int64_t n, - const int64_t nranks, - const T* sampled_class_interval_ptr, - const T* bound_index, - const T* bound_value, - const T* label_map_key, - T* label_map_value, - T* mapped_label) { - CUDA_KERNEL_LOOP(i, n) { -#pragma unroll - for (int64_t j = 0; j < nranks; j++) { - if (i >= bound_index[j] && i < bound_index[j + 1]) { - label_map_value[i] = - label_map_value[i] - bound_value[j] + sampled_class_interval_ptr[j]; - } - } - mapped_label[label_map_key[i]] = label_map_value[i]; - } -} - -// aligned vector generates vectorized load/store on CUDA -template -struct alignas(sizeof(T) * Size) AlignedVector { - T val[Size]; -}; - -template -inline int VectorizedSize(const T* pointer) { - uint64_t address = reinterpret_cast(pointer); - constexpr int vec4 = std::alignment_of>::value; // NOLINT - if (address % vec4 == 0) { - return 4; - } - return 1; -} - -#undef CUDA_KERNEL_LOOP - -template -class NotEqualToPreviousAdjacentIterator { - public: - using self_type = NotEqualToPreviousAdjacentIterator; - using value_type = T; - using difference_type = std::ptrdiff_t; - using pointer = T*; - using reference = T; - using iterator_category = std::input_iterator_tag; - - public: - __host__ __device__ __forceinline__ - NotEqualToPreviousAdjacentIterator(const T* arr, int64_t offset) - : arr_(arr), offset_(offset) {} - - __host__ __device__ __forceinline__ reference operator*() const { - return offset_ == 0 ? 0 : (arr_[offset_] == arr_[offset_ - 1] ? 0 : 1); - } - - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const { - self_type ret(arr_, offset_ + n); - return ret; - } - - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const { - self_type ret(arr_, offset_ - n); - return ret; - } - - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const { - return *(*this + n); - } - - private: - const T* arr_; - int64_t offset_; -}; - -template -struct ActualNumSampledFunctor { - __host__ __device__ __forceinline__ T operator()(const T& a, - const T& b) const { - return max(num_samples, (b - a)); - } - T num_samples; - explicit ActualNumSampledFunctor(const T num) : num_samples(num) {} -}; - -template -class MemoryBuffer { - public: - MemoryBuffer(const int num_buffer_ele, - const int num_temp_ele, - const int nranks, - const platform::Place& place) { - offset1 = 0; - offset2 = offset1 + num_buffer_ele; - offset3 = offset2 + num_buffer_ele; - offset4 = offset3 + num_buffer_ele; - offset5 = offset4 + num_buffer_ele; - offset6 = offset5 + (nranks + 1); - offset7 = offset6 + (nranks + 1); - offset8 = offset7 + (nranks + 1); - offset9 = offset8 + num_temp_ele; - - buffer_ptr = buffer.mutable_data( - {4 * num_buffer_ele + 3 * (nranks + 1) + num_temp_ele}, place); - } - - T* cub_sort_keys_ptr() { return buffer_ptr + offset1; } - T* cub_sort_keys_out_ptr() { return buffer_ptr + offset2; } - T* cub_sort_values_ptr() { return buffer_ptr + offset3; } - T* cub_sort_values_out_ptr() { return buffer_ptr + offset4; } - T* bound_index_ptr() { return buffer_ptr + offset5; } - T* bound_value_ptr() { return buffer_ptr + offset6; } - T* class_interval_ptr() { return buffer_ptr + offset7; } - void* cub_temp_storage_ptr() { - return reinterpret_cast(buffer_ptr + offset8); - } - - private: - Tensor buffer; - T* buffer_ptr; - int offset1; - int offset2; - int offset3; - int 
offset4; - int offset5; - int offset6; - int offset7; - int offset8; - int offset9; -}; - -template -class ClassCenterSampleCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* label = ctx.Input("Label"); - auto* remapped_label = ctx.Output("RemappedLabel"); - auto* sampled_local_class_center = - ctx.Output("SampledLocalClassCenter"); - int num_classes = ctx.Attr("num_classes"); - int num_samples = ctx.Attr("num_samples"); - - int rid = ctx.Attr("ring_id"); - int nranks = ctx.Attr("nranks"); - int rank = ctx.Attr("rank"); - - int seed = ctx.Attr("seed"); - bool fix_seed = ctx.Attr("fix_seed"); - PADDLE_ENFORCE_GT(num_classes, - 0, - platform::errors::InvalidArgument( - "The value 'num_classes' for Op(class_center_sample) " - "must be greater than 0, " - "but the value given is %d.", - num_classes)); - - PADDLE_ENFORCE_GT(num_samples, - 0, - platform::errors::InvalidArgument( - "The value 'num_samples' for Op(class_center_sample) " - "must be greater than 0, " - "but the value given is %d.", - num_samples)); - - PADDLE_ENFORCE_LE(num_samples, - num_classes, - platform::errors::InvalidArgument( - "The value 'num_samples' for Op(class_center_sample) " - "must be less than or equal to %d, " - "but the value given is %d.", - num_classes, - num_samples)); - - auto& dev_ctx = ctx.template device_context(); - auto place = dev_ctx.GetPlace(); - - int batch_size = label->numel(); - // Algorithm: - // We first randomly generate a value in [0, num_classes) on each position - // in a array(shape[num_classes]). Then, we mark the element as negative - // value in the array according input label. Now, we can sort the array - // by ascending to ensure that the positive class center always in the - // front of the sorted array. So, we can get the sampled class center - // index by sorted keys. Finally, we can get the rempped label by remap - // the input label according sampled class center. 
- - // step 1: Calculate num classes per device using nccl all reduce - std::vector shard_dim_vec(nranks + 1, 0); - shard_dim_vec[rank + 1] = num_classes; - Tensor num_classes_per_device; - framework::TensorFromVector( - shard_dim_vec, ctx.cuda_device_context(), &num_classes_per_device); - T* num_classes_per_device_ptr = num_classes_per_device.data(); - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (nranks > 1) { - auto map = distributed::ProcessGroupMapFromGid::getInstance(); - if (map->has(rid)) { - // Use ProcessGroup - distributed::ProcessGroup* pg = map->get(rid); - std::vector in_tensor; - std::vector out_tensor; - in_tensor.push_back(num_classes_per_device); - out_tensor.push_back(num_classes_per_device); - - distributed::AllreduceOptions opts; - opts.reduce_op = distributed::ReduceOp::SUM; - auto task = pg->AllReduce(in_tensor, out_tensor, opts); - task->Wait(); - } else { - const auto& comm = - platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace()); - // use global calculate stream - const auto calcu_stream = - static_cast( - platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) - ->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - num_classes_per_device_ptr, - num_classes_per_device_ptr, - num_classes_per_device.numel(), - platform::ToNCCLDataType( - framework::TransToProtoVarType(num_classes_per_device.dtype())), - ncclSum, - comm->comm(), - calcu_stream)); - } - } -#endif - - // step 2: Determine temporary device storage requirements - int num_buffer_ele = std::max(batch_size, num_classes); - size_t cub_sort_temp_store_size = 0; - PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( - nullptr, - cub_sort_temp_store_size, - nullptr, - nullptr, - nullptr, - nullptr, - num_buffer_ele, - 0, - sizeof(T) * 8, - ctx.cuda_device_context().stream()))); - - size_t cub_sum_temp_store_size = 0; - NotEqualToPreviousAdjacentIterator unique_counting_iter_temp(nullptr, 0); - PADDLE_ENFORCE_GPU_SUCCESS(( - cub::DeviceScan::InclusiveSum, - T*>(nullptr, - cub_sum_temp_store_size, - unique_counting_iter_temp, - nullptr, - batch_size, - ctx.cuda_device_context().stream()))); - - size_t cub_scan_temp_store_size = 0; - ActualNumSampledFunctor actual_num_sampled_op_temp(num_samples); - PADDLE_ENFORCE_GPU_SUCCESS( - (cub::DeviceScan::InclusiveScan(nullptr, - cub_scan_temp_store_size, - num_classes_per_device_ptr, - num_classes_per_device_ptr, - actual_num_sampled_op_temp, - nranks + 1, - ctx.cuda_device_context().stream()))); - - size_t cub_temp_storage_bytes = - std::max(std::max(cub_sort_temp_store_size, cub_scan_temp_store_size), - cub_sum_temp_store_size); - int num_temp_ele = cub_temp_storage_bytes / sizeof(T) + 1; - - // step 3: Alloc buffer memory so that we can reuse allocated memory - MemoryBuffer memory_buffer = - MemoryBuffer(num_buffer_ele, num_temp_ele, nranks, ctx.GetPlace()); - - T* cub_sort_keys_ptr = memory_buffer.cub_sort_keys_ptr(); - T* cub_sort_keys_out_ptr = memory_buffer.cub_sort_keys_out_ptr(); - T* cub_sort_values_ptr = memory_buffer.cub_sort_values_ptr(); - T* cub_sort_values_out_ptr = memory_buffer.cub_sort_values_out_ptr(); - T* bound_index_ptr = memory_buffer.bound_index_ptr(); - T* bound_value_ptr = memory_buffer.bound_value_ptr(); - T* class_interval_ptr = memory_buffer.class_interval_ptr(); - void* cub_temp_storage_ptr = memory_buffer.cub_temp_storage_ptr(); - - // step 4: Calculate class interval among nranks - PADDLE_ENFORCE_GPU_SUCCESS( - (cub::DeviceScan::InclusiveSum(cub_temp_storage_ptr, - 
cub_temp_storage_bytes, - num_classes_per_device_ptr, - class_interval_ptr, - nranks + 1, - ctx.cuda_device_context().stream()))); - - // step 5: random sample negative class center - uint64_t seed_data; - uint64_t increment; - int vec_size = VectorizedSize(cub_sort_keys_ptr); - auto offset = ((num_classes - 1) / - (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) + - 1) * - vec_size; - int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::DefaultCUDAGenerator(device_id); - if (!fix_seed) { - auto seed_offset = gen_cuda->IncrementOffset(offset); - seed_data = seed_offset.first; - increment = seed_offset.second; - } else { - seed_data = seed + rank; - increment = offset; - } - RandomSampleClassCenter<<>>( - num_classes, seed_data, increment, num_classes, cub_sort_keys_ptr); - - // step 6: mark positive class center as negative value - // fill the sort values to index 0, 1, ..., batch_size-1 - MarkPositiveClassCenter<<>>( - batch_size, - rank, - class_interval_ptr, - num_classes, - label->data(), - cub_sort_keys_ptr); - Range<<>>(num_buffer_ele, - cub_sort_values_ptr); - - // step 7: sort class center by ascending, so that positive class center - // always be sampled. - PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( - cub_temp_storage_ptr, - cub_temp_storage_bytes, - cub_sort_keys_ptr, - cub_sort_keys_out_ptr, - cub_sort_values_ptr, - cub_sort_values_out_ptr, - num_classes, - 0, - sizeof(T) * 8, - ctx.cuda_device_context().stream()))); - - // step 8: sort input label ascending - PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( - cub_temp_storage_ptr, - cub_temp_storage_bytes, - label->data(), - cub_sort_keys_out_ptr, - cub_sort_values_ptr, - cub_sort_keys_ptr, - batch_size, - 0, - sizeof(T) * 8, - ctx.cuda_device_context().stream()))); - - // step 9: Calculate new index using InclusiveSum on ascending sorted input - // label - NotEqualToPreviousAdjacentIterator unique_counting_iter( - cub_sort_keys_out_ptr, 0); - PADDLE_ENFORCE_GPU_SUCCESS(( - cub::DeviceScan::InclusiveSum, - T*>(cub_temp_storage_ptr, - cub_temp_storage_bytes, - unique_counting_iter, - cub_sort_values_ptr, - batch_size, - ctx.cuda_device_context().stream()))); - - // step 10: Calculate new class center bound among ranks - GetClassCenterBound - <<>>(batch_size, - nranks, - class_interval_ptr, - cub_sort_keys_out_ptr, - cub_sort_values_ptr, - bound_index_ptr, - bound_value_ptr); - - // step 11: Calculate actual number of sampled class per device. - // Since maybe num_positive_class_center > num_samples, - // we need to ensure all positive class center per device are sampled. 
- ActualNumSampledFunctor actual_num_sampled_op(num_samples); - PADDLE_ENFORCE_GPU_SUCCESS( - (cub::DeviceScan::InclusiveScan(cub_temp_storage_ptr, - cub_temp_storage_bytes, - bound_value_ptr, - num_classes_per_device_ptr, - actual_num_sampled_op, - nranks + 1, - ctx.cuda_device_context().stream()))); - - // step 12: Calculate actual sampled class interval among nranks - PADDLE_ENFORCE_GPU_SUCCESS( - (cub::DeviceScan::InclusiveSum(cub_temp_storage_ptr, - cub_temp_storage_bytes, - num_classes_per_device_ptr, - class_interval_ptr, - nranks + 1, - ctx.cuda_device_context().stream()))); - - // step 13: Get remapped label for output - GetRemappedLabel<<>>( - batch_size, - nranks, - class_interval_ptr, - bound_index_ptr, - bound_value_ptr, - cub_sort_keys_ptr, - cub_sort_values_ptr, - remapped_label->mutable_data(ctx.GetPlace())); - - // step 14: Get sampled class center for output - framework::TensorCopySync( - num_classes_per_device, platform::CPUPlace(), &num_classes_per_device); - T actual_num_samples = num_classes_per_device.data()[rank + 1]; - T* sampled_local_class_center_ptr = - sampled_local_class_center->mutable_data({actual_num_samples}, - ctx.GetPlace()); - memory::Copy(place, - sampled_local_class_center_ptr, - place, - cub_sort_values_out_ptr, - actual_num_samples * sizeof(T), - nullptr); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - class_center_sample, - ops::ClassCenterSampleCUDAKernel, - ops::ClassCenterSampleCUDAKernel); diff --git a/paddle/fluid/operators/class_center_sample_op.h b/paddle/fluid/operators/class_center_sample_op.h deleted file mode 100644 index 3efb33631f7ec..0000000000000 --- a/paddle/fluid/operators/class_center_sample_op.h +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -class ClassCenterSampleCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* label = ctx.Input("Label"); - auto* remapped_label = ctx.Output("RemappedLabel"); - auto* sampled_local_class_center = - ctx.Output("SampledLocalClassCenter"); - int num_classes = ctx.Attr("num_classes"); - int num_samples = ctx.Attr("num_samples"); - - int seed = ctx.Attr("seed"); - bool fix_seed = ctx.Attr("fix_seed"); - PADDLE_ENFORCE_GT(num_classes, - 0, - platform::errors::InvalidArgument( - "The value 'num_classes' for Op(class_center_sample) " - "must be greater than 0, " - "but the value given is %d.", - num_classes)); - - PADDLE_ENFORCE_GT(num_samples, - 0, - platform::errors::InvalidArgument( - "The value 'num_samples' for Op(class_center_sample) " - "must be greater than 0, " - "but the value given is %d.", - num_samples)); - - PADDLE_ENFORCE_LE(num_samples, - num_classes, - platform::errors::InvalidArgument( - "The value 'num_samples' for Op(class_center_sample) " - "must be less than or equal to %d, " - "but the value given is %d.", - num_classes, - num_samples)); - - int64_t numel = label->numel(); - auto* label_ptr = label->data(); - - // get unique positive class center by ascending - std::set> unique_label; - for (int64_t i = 0; i < numel; ++i) { - unique_label.insert(label_ptr[i]); - } - - // constrcut a lookup table and get sampled_local_class_center - std::vector actual_sampled; - std::map new_class_dict; - T idx = 0; - for (auto& t : unique_label) { - new_class_dict[t] = idx; - actual_sampled.push_back(t); - idx++; - } - - if (!fix_seed) { - std::random_device rnd; - seed = rnd(); - } - std::uniform_int_distribution dist(0, num_classes - 1); - auto engine = framework::GetCPURandomEngine(seed); - // sample negative class center randomly - while (unique_label.size() < static_cast(num_samples)) { - T neg = dist(*engine); - if (unique_label.find(neg) == unique_label.end()) { - unique_label.insert(neg); - // unorder for negative class center - actual_sampled.push_back(neg); - } - } - - int actual_num_samples = unique_label.size(); - T* sampled_local_class_center_ptr = - sampled_local_class_center->mutable_data({actual_num_samples}, - ctx.GetPlace()); - idx = 0; - for (auto& t : actual_sampled) { - sampled_local_class_center_ptr[idx] = t; - idx++; - } - - // remap the input label to sampled class - auto* remmaped_label_ptr = remapped_label->mutable_data(ctx.GetPlace()); - for (int64_t i = 0; i < numel; ++i) { - remmaped_label_ptr[i] = new_class_dict[label_ptr[i]]; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_op_mlu.cc b/paddle/fluid/operators/conv_transpose_op_mlu.cc index 322328b1c2e72..f757898886e1f 100644 --- a/paddle/fluid/operators/conv_transpose_op_mlu.cc +++ b/paddle/fluid/operators/conv_transpose_op_mlu.cc @@ -271,26 +271,18 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel { data_layout_mlu, ToCnnlDataType(input_grad_tensor.dtype())); - cnnlDataType_t tensor_dtype = ToCnnlDataType(); - cnnlDataType_t dt_onchip = ToCnnlDataType(); - MLUCnnl::Conv2D(ctx, - conv_desc.get(), - tensor_dtype, - dt_onchip, - nullptr /* input_position */, - nullptr /* input_scale */, - nullptr /* input_offset */, - nullptr 
/* filter_position */, - nullptr /* filter_scale */, - nullptr /* filter_offset */, - output_grad_desc.get(), - GetBasePtr(&output_grad_tensor), - trans_filter_desc.get(), - GetBasePtr(&trans_filter), - nullptr /* bias_desc*/, - nullptr /* bias */, - input_grad_desc.get(), - GetBasePtr(&input_grad_tensor)); + MLUCnnl::ConvolutionForward(ctx, + conv_desc.get(), + nullptr /*alpha*/, + nullptr /*beta*/, + nullptr /*bias_desc*/, + nullptr /*bias_ptr*/, + output_grad_desc.get(), + GetBasePtr(&output_grad_tensor), + trans_filter_desc.get(), + GetBasePtr(&trans_filter), + input_grad_desc.get(), + GetBasePtr(&input_grad_tensor)); if (!channel_last) { // transpose output from NHWC to NCHW const std::vector perm_to_nchw = {0, 3, 1, 2}; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 775b2a4f8bfa9..578827f56cbc0 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -50,12 +50,11 @@ elseif(WITH_MLU) elseif(WITH_ASCEND_CL) detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu - prior_box_op_npu.cc) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc) else() detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) + detection_library(prior_box_op SRCS prior_box_op.cc) # detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc) endif() @@ -93,7 +92,7 @@ if(WITH_GPU OR WITH_ROCM) detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS}) detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc - generate_proposals_v2_op.cu DEPS ${TMPDEPS}) + DEPS ${TMPDEPS}) detection_library( distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS}) diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc index 26db517690ef6..ec8d8a71008cb 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -109,7 +109,7 @@ class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { "The referring scale of FPN layer with" " specified level"); AddAttr("pixel_offset", - "(bool, default True),", + "(bool, default True)," "If true, im_shape pixel offset is 1.") .SetDefault(true); AddComment(R"DOC( diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 29d7347f1ba75..0118cc1f76b3f 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/detection/nms_util.h" +#include "paddle/phi/kernels/funcs/detection/nms_util.h" #include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -251,7 +251,8 @@ class GenerateProposalsKernel : public framework::OpKernel { return std::make_pair(bbox_sel, scores_filter); } - Tensor keep_nms = NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); + Tensor keep_nms = + phi::funcs::NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { keep_nms.Resize({post_nms_top_n}); diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 450154bec4e17..eeda4c819e12a 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -17,10 +17,12 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/detection/nms_util.h" +#include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/kernels/funcs/detection/nms_util.h" #include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -34,36 +36,6 @@ class GenerateProposalsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Scores"), - true, - platform::errors::NotFound("Input(Scores) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("BboxDeltas"), - true, - platform::errors::NotFound("Input(BboxDeltas) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("ImShape"), - true, - platform::errors::NotFound("Input(ImShape) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Anchors"), - true, - platform::errors::NotFound("Input(Anchors) shouldn't be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Variances"), - true, - platform::errors::NotFound("Input(Variances) shouldn't be null.")); - - ctx->SetOutputDim("RpnRois", {-1, 4}); - ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); - if (!ctx->IsRuntime()) { - ctx->SetLoDLevel("RpnRois", std::max(ctx->GetLoDLevel("Scores"), 1)); - ctx->SetLoDLevel("RpnRoiProbs", std::max(ctx->GetLoDLevel("Scores"), 1)); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -73,206 +45,6 @@ class GenerateProposalsV2Op : public framework::OperatorWithKernel { } }; -template -class GenerateProposalsV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *scores = context.Input("Scores"); - auto *bbox_deltas = context.Input("BboxDeltas"); - auto *im_shape = context.Input("ImShape"); - auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), - "Input", - "Anchors", - "GenerateProposals"); - auto variances = GET_DATA_SAFELY(context.Input("Variances"), - "Input", - "Variances", - "GenerateProposals"); - - auto *rpn_rois = context.Output("RpnRois"); - auto *rpn_roi_probs = 
context.Output("RpnRoiProbs"); - - int pre_nms_top_n = context.Attr("pre_nms_topN"); - int post_nms_top_n = context.Attr("post_nms_topN"); - float nms_thresh = context.Attr("nms_thresh"); - float min_size = context.Attr("min_size"); - float eta = context.Attr("eta"); - bool pixel_offset = context.Attr("pixel_offset"); - - auto &dev_ctx = context.template device_context(); - - auto &scores_dim = scores->dims(); - int64_t num = scores_dim[0]; - int64_t c_score = scores_dim[1]; - int64_t h_score = scores_dim[2]; - int64_t w_score = scores_dim[3]; - - auto &bbox_dim = bbox_deltas->dims(); - int64_t c_bbox = bbox_dim[1]; - int64_t h_bbox = bbox_dim[2]; - int64_t w_bbox = bbox_dim[3]; - - rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, - context.GetPlace()); - rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); - - Tensor bbox_deltas_swap, scores_swap; - bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, - dev_ctx.GetPlace()); - scores_swap.mutable_data({num, h_score, w_score, c_score}, - dev_ctx.GetPlace()); - - phi::funcs::Transpose trans; - std::vector axis = {0, 2, 3, 1}; - trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); - trans(dev_ctx, *scores, &scores_swap, axis); - - framework::LoD lod; - lod.resize(1); - auto &lod0 = lod[0]; - lod0.push_back(0); - anchors.Resize({anchors.numel() / 4, 4}); - variances.Resize({variances.numel() / 4, 4}); - std::vector tmp_num; - - int64_t num_proposals = 0; - for (int64_t i = 0; i < num; ++i) { - Tensor im_shape_slice = im_shape->Slice(i, i + 1); - Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); - Tensor scores_slice = scores_swap.Slice(i, i + 1); - - bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); - scores_slice.Resize({h_score * w_score * c_score, 1}); - - std::pair tensor_pair = - ProposalForOneImage(dev_ctx, - im_shape_slice, - anchors, - variances, - bbox_deltas_slice, - scores_slice, - pre_nms_top_n, - post_nms_top_n, - nms_thresh, - min_size, - eta, - pixel_offset); - Tensor &proposals = tensor_pair.first; - Tensor &scores = tensor_pair.second; - - AppendProposals(rpn_rois, 4 * num_proposals, proposals); - AppendProposals(rpn_roi_probs, num_proposals, scores); - num_proposals += proposals.dims()[0]; - lod0.push_back(num_proposals); - tmp_num.push_back(proposals.dims()[0]); - } - if (context.HasOutput("RpnRoisNum")) { - auto *rpn_rois_num = context.Output("RpnRoisNum"); - rpn_rois_num->mutable_data({num}, context.GetPlace()); - int *num_data = rpn_rois_num->data(); - for (int i = 0; i < num; i++) { - num_data[i] = tmp_num[i]; - } - rpn_rois_num->Resize({num}); - } - rpn_rois->set_lod(lod); - rpn_roi_probs->set_lod(lod); - rpn_rois->Resize({num_proposals, 4}); - rpn_roi_probs->Resize({num_proposals, 1}); - } - - std::pair ProposalForOneImage( - const phi::CPUContext &ctx, - const Tensor &im_shape_slice, - const Tensor &anchors, - const Tensor &variances, - const Tensor &bbox_deltas_slice, // [M, 4] - const Tensor &scores_slice, // [N, 1] - int pre_nms_top_n, - int post_nms_top_n, - float nms_thresh, - float min_size, - float eta, - bool pixel_offset = true) const { - auto *scores_data = scores_slice.data(); - - // Sort index - Tensor index_t; - index_t.Resize({scores_slice.numel()}); - int *index = index_t.mutable_data(ctx.GetPlace()); - for (int i = 0; i < scores_slice.numel(); ++i) { - index[i] = i; - } - auto compare = [scores_data](const int64_t &i, const int64_t &j) { - return scores_data[i] > scores_data[j]; - }; - - if (pre_nms_top_n <= 0 || pre_nms_top_n >= 
scores_slice.numel()) { - std::sort(index, index + scores_slice.numel(), compare); - } else { - std::nth_element( - index, index + pre_nms_top_n, index + scores_slice.numel(), compare); - index_t.Resize({pre_nms_top_n}); - } - - Tensor scores_sel, bbox_sel, anchor_sel, var_sel; - scores_sel.mutable_data({index_t.numel(), 1}, ctx.GetPlace()); - bbox_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - - phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); - phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); - phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); - - Tensor proposals; - proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - BoxCoder( - ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals, pixel_offset); - - ClipTiledBoxes( - ctx, im_shape_slice, proposals, &proposals, false, pixel_offset); - - Tensor keep; - FilterBoxes( - ctx, &proposals, min_size, im_shape_slice, false, &keep, pixel_offset); - // Handle the case when there is no keep index left - if (keep.numel() == 0) { - phi::funcs::SetConstant set_zero; - bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); - set_zero(ctx, &bbox_sel, static_cast(0)); - Tensor scores_filter; - scores_filter.mutable_data({1, 1}, ctx.GetPlace()); - set_zero(ctx, &scores_filter, static_cast(0)); - return std::make_pair(bbox_sel, scores_filter); - } - - Tensor scores_filter; - bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); - scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); - phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); - if (nms_thresh <= 0) { - return std::make_pair(bbox_sel, scores_filter); - } - - Tensor keep_nms = - NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta, pixel_offset); - - if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { - keep_nms.Resize({post_nms_top_n}); - } - - proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); - scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); - phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); - - return std::make_pair(proposals, scores_sel); - } -}; - class GenerateProposalsV2OpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -310,7 +82,7 @@ class GenerateProposalsV2OpMaker : public framework::OpProtoAndCheckerMaker { "than this min_size."); AddAttr("eta", "The parameter for adaptive NMS."); AddAttr("pixel_offset", - "(bool, default True),", + "(bool, default True)," "If true, im_shape pixel offset is 1.") .SetDefault(true); AddComment(R"DOC( @@ -336,16 +108,19 @@ to before and will not effect the result. 
} // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(generate_proposals_v2, + GenerateProposalsV2InferShapeFunctor, + PD_INFER_META(phi::GenerateProposalsV2InferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR( generate_proposals_v2, ops::GenerateProposalsV2Op, ops::GenerateProposalsV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(generate_proposals_v2, - ops::GenerateProposalsV2Kernel, - ops::GenerateProposalsV2Kernel); + paddle::framework::EmptyGradOpMaker, + GenerateProposalsV2InferShapeFunctor); + REGISTER_OP_VERSION(generate_proposals_v2) .AddCheckpoint( R"ROC(Registe generate_proposals_v2 for adding the attribute of pixel_offset)ROC", diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu deleted file mode 100644 index 682a9adf65952..0000000000000 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include -#include - -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/detection/bbox_util.cu.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -namespace { -template -static std::pair ProposalForOneImage( - const phi::GPUContext &ctx, - const Tensor &im_shape, - const Tensor &anchors, - const Tensor &variances, - const Tensor &bbox_deltas, // [M, 4] - const Tensor &scores, // [N, 1] - int pre_nms_top_n, - int post_nms_top_n, - float nms_thresh, - float min_size, - float eta, - bool pixel_offset) { - // 1. pre nms - Tensor scores_sort, index_sort; - SortDescending(ctx, scores, &scores_sort, &index_sort); - int num = scores.numel(); - int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() - : pre_nms_top_n; - scores_sort.Resize({pre_nms_num, 1}); - index_sort.Resize({pre_nms_num, 1}); - - // 2. box decode and clipping - Tensor proposals; - proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); - - { - platform::ForRange for_range(ctx, pre_nms_num); - for_range(BoxDecodeAndClipFunctor{anchors.data(), - bbox_deltas.data(), - variances.data(), - index_sort.data(), - im_shape.data(), - proposals.data(), - pixel_offset}); - } - - // 3. 
filter - Tensor keep_index, keep_num_t; - keep_index.mutable_data({pre_nms_num}, ctx.GetPlace()); - keep_num_t.mutable_data({1}, ctx.GetPlace()); - min_size = std::max(min_size, 1.0f); - auto stream = ctx.stream(); - FilterBBoxes<<<1, 512, 0, stream>>>(proposals.data(), - im_shape.data(), - min_size, - pre_nms_num, - keep_num_t.data(), - keep_index.data(), - false, - pixel_offset); - int keep_num; - const auto gpu_place = ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), - &keep_num, - gpu_place, - keep_num_t.data(), - sizeof(int), - ctx.stream()); - ctx.Wait(); - keep_index.Resize({keep_num}); - - Tensor scores_filter, proposals_filter; - // Handle the case when there is no keep index left - if (keep_num == 0) { - phi::funcs::SetConstant set_zero; - proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); - scores_filter.mutable_data({1, 1}, ctx.GetPlace()); - set_zero(ctx, &proposals_filter, static_cast(0)); - set_zero(ctx, &scores_filter, static_cast(0)); - return std::make_pair(proposals_filter, scores_filter); - } - proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); - scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); - phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); - - if (nms_thresh <= 0) { - return std::make_pair(proposals_filter, scores_filter); - } - - // 4. nms - Tensor keep_nms; - NMS( - ctx, proposals_filter, keep_index, nms_thresh, &keep_nms, pixel_offset); - if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { - keep_nms.Resize({post_nms_top_n}); - } - - Tensor scores_nms, proposals_nms; - proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); - scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); - - return std::make_pair(proposals_nms, scores_nms); -} -} // namespace - -template -class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *scores = context.Input("Scores"); - auto *bbox_deltas = context.Input("BboxDeltas"); - auto *im_shape = context.Input("ImShape"); - auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), - "Input", - "Anchors", - "GenerateProposals"); - auto variances = GET_DATA_SAFELY(context.Input("Variances"), - "Input", - "Variances", - "GenerateProposals"); - - auto *rpn_rois = context.Output("RpnRois"); - auto *rpn_roi_probs = context.Output("RpnRoiProbs"); - - int pre_nms_top_n = context.Attr("pre_nms_topN"); - int post_nms_top_n = context.Attr("post_nms_topN"); - float nms_thresh = context.Attr("nms_thresh"); - float min_size = context.Attr("min_size"); - float eta = context.Attr("eta"); - bool pixel_offset = context.Attr("pixel_offset"); - PADDLE_ENFORCE_GE(eta, - 1., - platform::errors::InvalidArgument( - "Not support adaptive NMS. The attribute 'eta' " - "should not less than 1. 
But received eta=[%d]", - eta)); - - auto &dev_ctx = context.template device_context(); - - auto scores_dim = scores->dims(); - int64_t num = scores_dim[0]; - int64_t c_score = scores_dim[1]; - int64_t h_score = scores_dim[2]; - int64_t w_score = scores_dim[3]; - - auto bbox_dim = bbox_deltas->dims(); - int64_t c_bbox = bbox_dim[1]; - int64_t h_bbox = bbox_dim[2]; - int64_t w_bbox = bbox_dim[3]; - - Tensor bbox_deltas_swap, scores_swap; - bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, - dev_ctx.GetPlace()); - scores_swap.mutable_data({num, h_score, w_score, c_score}, - dev_ctx.GetPlace()); - - phi::funcs::Transpose trans; - std::vector axis = {0, 2, 3, 1}; - trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); - trans(dev_ctx, *scores, &scores_swap, axis); - - anchors.Resize({anchors.numel() / 4, 4}); - variances.Resize({variances.numel() / 4, 4}); - - rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, - context.GetPlace()); - rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); - - T *rpn_rois_data = rpn_rois->data(); - T *rpn_roi_probs_data = rpn_roi_probs->data(); - - auto place = dev_ctx.GetPlace(); - auto cpu_place = platform::CPUPlace(); - - int64_t num_proposals = 0; - std::vector offset(1, 0); - std::vector tmp_num; - - for (int64_t i = 0; i < num; ++i) { - Tensor im_shape_slice = im_shape->Slice(i, i + 1); - Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); - Tensor scores_slice = scores_swap.Slice(i, i + 1); - - bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); - scores_slice.Resize({h_score * w_score * c_score, 1}); - - std::pair box_score_pair = - ProposalForOneImage(dev_ctx, - im_shape_slice, - anchors, - variances, - bbox_deltas_slice, - scores_slice, - pre_nms_top_n, - post_nms_top_n, - nms_thresh, - min_size, - eta, - pixel_offset); - - Tensor &proposals = box_score_pair.first; - Tensor &scores = box_score_pair.second; - - memory::Copy(place, - rpn_rois_data + num_proposals * 4, - place, - proposals.data(), - sizeof(T) * proposals.numel(), - dev_ctx.stream()); - memory::Copy(place, - rpn_roi_probs_data + num_proposals, - place, - scores.data(), - sizeof(T) * scores.numel(), - dev_ctx.stream()); - dev_ctx.Wait(); - num_proposals += proposals.dims()[0]; - offset.emplace_back(num_proposals); - tmp_num.push_back(proposals.dims()[0]); - } - if (context.HasOutput("RpnRoisNum")) { - auto *rpn_rois_num = context.Output("RpnRoisNum"); - rpn_rois_num->mutable_data({num}, context.GetPlace()); - int *num_data = rpn_rois_num->data(); - memory::Copy(place, - num_data, - cpu_place, - &tmp_num[0], - sizeof(int) * num, - dev_ctx.stream()); - rpn_rois_num->Resize({num}); - } - framework::LoD lod; - lod.emplace_back(offset); - rpn_rois->set_lod(lod); - rpn_roi_probs->set_lod(lod); - rpn_rois->Resize({num_proposals, 4}); - rpn_roi_probs->Resize({num_proposals, 1}); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - generate_proposals_v2, - ops::CUDAGenerateProposalsV2Kernel); diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index 6fb48229517d3..16e2c28265d14 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detection/nms_util.h" +#include "paddle/phi/kernels/funcs/detection/nms_util.h" namespace paddle { namespace operators { @@ -118,15 +118,15 @@ void GetMaxScoreIndexWithLocalityAware( if (index > -1) { T overlap = T(0.); if (box_size == 4) { - overlap = JaccardOverlap( + overlap = phi::funcs::JaccardOverlap( bbox_data + i * box_size, bbox_data + index * box_size, normalized); } // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 if (box_size == 8 || box_size == 16 || box_size == 24 || box_size == 32) { - overlap = PolyIoU(bbox_data + i * box_size, - bbox_data + index * box_size, - box_size, - normalized); + overlap = phi::funcs::PolyIoU(bbox_data + i * box_size, + bbox_data + index * box_size, + box_size, + normalized); } if (overlap > nms_threshold) { @@ -156,7 +156,7 @@ void GetMaxScoreIndexWithLocalityAware( // Sort the score pair according to the scores in descending order std::stable_sort(sorted_indices->begin(), sorted_indices->end(), - SortScorePairDescend); + phi::funcs::SortScorePairDescend); // Keep top_k scores if needed. if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { sorted_indices->resize(top_k); @@ -207,17 +207,18 @@ class LocalityAwareNMSKernel : public framework::OpKernel { T overlap = T(0.); // 4: [xmin ymin xmax ymax] if (box_size == 4) { - overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, - normalized); + overlap = + phi::funcs::JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, + normalized); } // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 if (box_size == 8 || box_size == 16 || box_size == 24 || box_size == 32) { - overlap = PolyIoU(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, - box_size, - normalized); + overlap = phi::funcs::PolyIoU(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, + box_size, + normalized); } keep = overlap <= adaptive_threshold; } else { @@ -290,7 +291,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { // Keep top k results per image. std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), - SortScorePairDescend>); + phi::funcs::SortScorePairDescend>); score_index_pairs.resize(keep_top_k); // Store the new indices. diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index feacea63e390f..1c0d19d9d5937 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/detection/nms_util.h" #include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/kernels/funcs/detection/nms_util.h" namespace paddle { namespace operators { @@ -85,7 +85,7 @@ class MatrixNMSOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("gaussian_sigma", "(float) " - "Sigma for Gaussian decreasing function, only takes effect ", + "Sigma for Gaussian decreasing function, only takes effect " "when 'use_gaussian' is enabled.") .SetDefault(2.); AddOutput("Out", diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 7f0bb2a97ce27..67b26ddbc2df9 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detection/nms_util.h" #include "paddle/phi/infermeta/ternary.h" +#include "paddle/phi/kernels/funcs/detection/nms_util.h" namespace paddle { namespace operators { @@ -166,7 +166,8 @@ class MultiClassNMSKernel : public framework::OpKernel { std::vector scores_data(num_boxes); std::copy_n(scores.data(), num_boxes, scores_data.begin()); std::vector> sorted_indices; - GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + phi::funcs::GetMaxScoreIndex( + scores_data, score_threshold, top_k, &sorted_indices); selected_indices->clear(); T adaptive_threshold = nms_threshold; @@ -181,17 +182,18 @@ class MultiClassNMSKernel : public framework::OpKernel { T overlap = T(0.); // 4: [xmin ymin xmax ymax] if (box_size == 4) { - overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, - normalized); + overlap = + phi::funcs::JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, + normalized); } // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 if (box_size == 8 || box_size == 16 || box_size == 24 || box_size == 32) { - overlap = PolyIoU(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, - box_size, - normalized); + overlap = phi::funcs::PolyIoU(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, + box_size, + normalized); } keep = overlap <= adaptive_threshold; } else { @@ -276,7 +278,7 @@ class MultiClassNMSKernel : public framework::OpKernel { // Keep top k results per image. std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), - SortScorePairDescend>); + phi::funcs::SortScorePairDescend>); score_index_pairs.resize(keep_top_k); // Store the new indices. diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu deleted file mode 100644 index 1cdf769133829..0000000000000 --- a/paddle/fluid/operators/detection/prior_box_op.cu +++ /dev/null @@ -1,195 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/prior_box_op.h" - -namespace paddle { -namespace operators { - -template -__device__ inline T clip(T in) { - return min(max(in, 0.), 1.); -} - -template -__global__ void GenPriorBox(T* out, - const T* aspect_ratios, - const int height, - const int width, - const int im_height, - const int im_width, - const int as_num, - const T offset, - const T step_width, - const T step_height, - const T* min_sizes, - const T* max_sizes, - const int min_num, - bool is_clip, - bool min_max_aspect_ratios_order) { - int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num; - int box_num = height * width * num_priors; - CUDA_KERNEL_LOOP(i, box_num) { - int h = i / (num_priors * width); - int w = (i / num_priors) % width; - int p = i % num_priors; - int m = max_sizes ? p / (as_num + 1) : p / as_num; - T cx = (w + offset) * step_width; - T cy = (h + offset) * step_height; - T bw, bh; - T min_size = min_sizes[m]; - if (max_sizes) { - int s = p % (as_num + 1); - if (!min_max_aspect_ratios_order) { - if (s < as_num) { - T ar = aspect_ratios[s]; - bw = min_size * sqrt(ar) / 2.; - bh = min_size / sqrt(ar) / 2.; - } else { - T max_size = max_sizes[m]; - bw = sqrt(min_size * max_size) / 2.; - bh = bw; - } - } else { - if (s == 0) { - bw = bh = min_size / 2.; - } else if (s == 1) { - T max_size = max_sizes[m]; - bw = sqrt(min_size * max_size) / 2.; - bh = bw; - } else { - T ar = aspect_ratios[s - 1]; - bw = min_size * sqrt(ar) / 2.; - bh = min_size / sqrt(ar) / 2.; - } - } - } else { - int s = p % as_num; - T ar = aspect_ratios[s]; - bw = min_size * sqrt(ar) / 2.; - bh = min_size / sqrt(ar) / 2.; - } - T xmin = (cx - bw) / im_width; - T ymin = (cy - bh) / im_height; - T xmax = (cx + bw) / im_width; - T ymax = (cy + bh) / im_height; - out[i * 4] = is_clip ? clip(xmin) : xmin; - out[i * 4 + 1] = is_clip ? clip(ymin) : ymin; - out[i * 4 + 2] = is_clip ? clip(xmax) : xmax; - out[i * 4 + 3] = is_clip ? 
clip(ymax) : ymax; - } -} - -template -__global__ void SetVariance(T* out, - const T* var, - const int vnum, - const int num) { - CUDA_KERNEL_LOOP(i, num) { out[i] = var[i % vnum]; } -} - -template -class PriorBoxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* vars = ctx.Output("Variances"); - - auto min_sizes = ctx.Attr>("min_sizes"); - auto max_sizes = ctx.Attr>("max_sizes"); - auto input_aspect_ratio = ctx.Attr>("aspect_ratios"); - auto variances = ctx.Attr>("variances"); - auto flip = ctx.Attr("flip"); - auto clip = ctx.Attr("clip"); - auto min_max_aspect_ratios_order = - ctx.Attr("min_max_aspect_ratios_order"); - - std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); - - T step_w = static_cast(ctx.Attr("step_w")); - T step_h = static_cast(ctx.Attr("step_h")); - T offset = static_cast(ctx.Attr("offset")); - - auto im_width = image->dims()[3]; - auto im_height = image->dims()[2]; - - auto width = input->dims()[3]; - auto height = input->dims()[2]; - - T step_width, step_height; - if (step_w == 0 || step_h == 0) { - step_width = static_cast(im_width) / width; - step_height = static_cast(im_height) / height; - } else { - step_width = step_w; - step_height = step_h; - } - - int num_priors = aspect_ratios.size() * min_sizes.size(); - if (max_sizes.size() > 0) { - num_priors += max_sizes.size(); - } - int min_num = static_cast(min_sizes.size()); - int box_num = width * height * num_priors; - - int block = 512; - int grid = (box_num + block - 1) / block; - - auto stream = ctx.template device_context().stream(); - - boxes->mutable_data(ctx.GetPlace()); - vars->mutable_data(ctx.GetPlace()); - - framework::Tensor r; - framework::TensorFromVector(aspect_ratios, ctx.device_context(), &r); - - framework::Tensor min; - framework::TensorFromVector(min_sizes, ctx.device_context(), &min); - - T* max_data = nullptr; - framework::Tensor max; - if (max_sizes.size() > 0) { - framework::TensorFromVector(max_sizes, ctx.device_context(), &max); - max_data = max.data(); - } - - GenPriorBox<<>>(boxes->data(), - r.data(), - height, - width, - im_height, - im_width, - aspect_ratios.size(), - offset, - step_width, - step_height, - min.data(), - max_data, - min_num, - clip, - min_max_aspect_ratios_order); - - framework::Tensor v; - framework::TensorFromVector(variances, ctx.device_context(), &v); - grid = (box_num * 4 + block - 1) / block; - SetVariance<<>>( - vars->data(), v.data(), variances.size(), box_num * 4); - } -}; // namespace operators - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc deleted file mode 100644 index 1e615aa7be7fd..0000000000000 --- a/paddle/fluid/operators/erfinv_op.cc +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class ErfinvOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class ErfinvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of erfinv op."); - AddOutput("Out", "(Tensor), The output tensor of erfinv op."); - AddComment(R"DOC( -Erfinv Operator. - -This operator is used to compute inverse error function of input $X$. - -The equation is: - -$$erfinv(x) = {ndtri({x \over 2} + 0.5)} \over {\sqrt{2}}$$ - -The input `X` can carry the LoD (Level of Details) information, -or not. And the output shares the LoD information with input `X`. -)DOC"); - } -}; - -class ErfinvGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); - } -}; - -template -class ErfinvGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr op) const override { - op->SetType("erfinv_grad"); - op->SetInput("Out", this->Output("Out")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"}); - -} // namespace operators -} // namespace paddle - -DECLARE_INFER_SHAPE_FUNCTOR(erfinv, - ErfinvInferShapeFunctor, - PD_INFER_META(phi::UnchangedInferMeta)); - -REGISTER_OPERATOR( - erfinv, - paddle::operators::ErfinvOp, - paddle::operators::ErfinvOpMaker, - paddle::operators::ErfinvGradMaker, - paddle::operators::ErfinvGradMaker, - paddle::operators::ErfinvInplaceInferer, - ErfinvInferShapeFunctor); - -REGISTER_OPERATOR(erfinv_grad, paddle::operators::ErfinvGradOp); diff --git a/paddle/fluid/operators/fill_any_op.cc b/paddle/fluid/operators/fill_any_op.cc index 853ebbdd9e57c..1af302d1fc032 100644 --- a/paddle/fluid/operators/fill_any_op.cc +++ b/paddle/fluid/operators/fill_any_op.cc @@ -12,7 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/fill_any_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -34,30 +38,11 @@ class FillAnyOpMaker : public framework::OpProtoAndCheckerMaker { class FillAnyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "FillAny"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "FillAny"); - auto x_dims = context->GetInputDim("X"); - context->SetOutputDim("Out", x_dims); - } }; class FillAnyGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "mul"); - auto x_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - } }; template @@ -82,31 +67,22 @@ DECLARE_INPLACE_OP_INFERER(FillAnyGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(fill_any, + FillInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(fill_any_grad, + FillAnyInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(fill_any, ops::FillAnyOp, ops::FillAnyOpMaker, ops::FillAnyGradOpMaker, ops::FillAnyGradOpMaker, - ops::FillAnyOpInplaceInferer); + ops::FillAnyOpInplaceInferer, + FillInferShapeFunctor); REGISTER_OPERATOR(fill_any_grad, ops::FillAnyGradOp, - ops::FillAnyGradInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - fill_any, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel); - -REGISTER_OP_CPU_KERNEL( - fill_any_grad, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel); + ops::FillAnyGradInplaceInferer, + FillAnyInferShapeFunctor); diff --git a/paddle/fluid/operators/fill_any_op.cu.cc b/paddle/fluid/operators/fill_any_op.cu.cc deleted file mode 100644 index 2a561e6d3500e..0000000000000 --- a/paddle/fluid/operators/fill_any_op.cu.cc +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/fill_any_op.h" -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - fill_any, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel); - -REGISTER_OP_CUDA_KERNEL( - fill_any_grad, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel); diff --git a/paddle/fluid/operators/fill_any_op.h b/paddle/fluid/operators/fill_any_op.h deleted file mode 100644 index 4f59d4f6ec659..0000000000000 --- a/paddle/fluid/operators/fill_any_op.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class FillAnyKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto *out = ctx.Output("Out"); - auto floatvar = ctx.template Attr("value_float"); - auto intvar = ctx.template Attr("value_int"); - auto isfloat = ((typeid(float) == typeid(T)) || - (typeid(double) == typeid(T) || - typeid(paddle::platform::float16) == typeid(T))); - - T fill_var = static_cast(floatvar); - if (!isfloat) { - fill_var = static_cast(intvar); - } - - PADDLE_ENFORCE_EQ( - std::isnan(static_cast(fill_var)), - false, - platform::errors::InvalidArgument("fill value should not be NaN," - " but received NaN")); - - out->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - out, - static_cast(fill_var)); - } -}; - -template -class FillAnyGradKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto *dx = ctx.Output(framework::GradVarName("X")); - if (dx) { - dx->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), dx, T(0)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.cc b/paddle/fluid/operators/fill_diagonal_tensor_op.cc index d2e248cffd44c..ccf9b7aa35938 100644 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.cc +++ b/paddle/fluid/operators/fill_diagonal_tensor_op.cc @@ -12,64 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/fill_diagonal_tensor_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -// calculate the offset\new_dims\(strides of dim1/dim2)\matoffset -void CalMatDims(framework::DDim out_dims, - int dim1, - int dim2, - int64_t *offset, - int64_t *new_dims, - int64_t *strides, - int64_t *matoffset) { - int64_t dimprod = 1, batchdim = 1; - int rank = out_dims.size(); - int matoffidx = 0; - for (int i = rank - 1; i >= 0; i--) { - if (i == dim2) { - strides[0] = dimprod; - } else if (i == dim1) { - strides[1] = dimprod; - } else { - batchdim *= out_dims[i]; - // matoffset calculate the offset position of the diagonal defined by dim1 - // and dim2 - // the first circle calculate the final free dimension - // and then calculate the front free dim one by one - if (matoffidx == 0) { - for (int64_t j = 0; j < out_dims[i]; j++) { - matoffset[matoffidx] = dimprod * j; - matoffidx++; - } - } else { - auto size = matoffidx; - for (int64_t j = 1; j < out_dims[i]; j++) { - for (int64_t k = 0; k < size; k++) { - matoffset[matoffidx] = matoffset[k] + dimprod * j; - matoffidx++; - } - } - } - } - dimprod *= out_dims[i]; - } - - auto diagdim = dim1; - if (*offset >= 0) { - diagdim = std::min(out_dims[dim1], out_dims[dim2] - *offset); - *offset *= strides[0]; - } else { - diagdim = std::min(out_dims[dim1] + *offset, out_dims[dim2]); - *offset *= -strides[1]; - } - new_dims[0] = batchdim; - new_dims[1] = diagdim; - return; -} - class FillDiagonalTensorOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -97,14 +47,6 @@ class FillDiagonalTensorOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "FillDiagonalTensor"); - OP_INOUT_CHECK( - context->HasOutput("Out"), "Output", "Out", "FillDiagonalTensor"); - auto x_dims = context->GetInputDim("X"); - context->SetOutputDim("Out", x_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -124,77 +66,10 @@ class FillDiagonalTensorOpVarTypeInference } }; -template -class FillDiagonalTensorKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto *out = ctx.Output("Out"); - auto *srctensor = ctx.Input("Y"); - auto dim1 = ctx.Attr("dim1"); - auto dim2 = ctx.Attr("dim2"); - auto offset = ctx.Attr("offset"); - auto *xin = ctx.Input("X"); - - T *out_data = out->mutable_data(ctx.GetPlace()); - const T *fill_data = srctensor->data(); - - framework::TensorCopy(*xin, ctx.GetPlace(), out); - auto out_dims = out->dims(); - auto matdims = srctensor->dims(); - auto fill_dims = phi::flatten_to_2d(matdims, matdims.size() - 1); - - int64_t new_dims[2], strides[2]; - std::vector matdim; - matdim.resize(fill_dims[0]); - CalMatDims(out_dims, dim1, dim2, &offset, new_dims, strides, matdim.data()); - PADDLE_ENFORCE_EQ( - new_dims[0], - fill_dims[0], - platform::errors::InvalidArgument("The dims should be %d x %d, but get " - "%d x %d in fill tensor Y", - new_dims[0], - new_dims[1], - fill_dims[0], - fill_dims[1])); - PADDLE_ENFORCE_EQ( - new_dims[1], - fill_dims[1], - platform::errors::InvalidArgument("The dims should 
be %d x %d, but get " - "%d x %d in fill tensor Y", - new_dims[0], - new_dims[1], - fill_dims[0], - fill_dims[1])); - - auto size = out->numel(); - for (int64_t i = 0; i < fill_dims[0]; i += 1) { - auto sumoff = matdim[i] + offset; - for (int64_t j = 0; j < fill_dims[1]; j += 1) { - auto fill_index = j * (strides[1] + strides[0]) + sumoff; - if (fill_index < size) { - out_data[fill_index] = fill_data[i * fill_dims[1] + j]; - } - } - } - } -}; - class FillDiagonalTensorGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "mul"); - auto x_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { // Note: don't get data type from ctx.Input("Input"); @@ -219,50 +94,6 @@ class FillDiagonalTensorGradOpMaker : public framework::SingleGradOpMaker { } }; -template -class FillDiagonalTensorGradKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *dout = ctx.Input(framework::GradVarName("Out")); - - auto dim1 = ctx.Attr("dim1"); - auto dim2 = ctx.Attr("dim2"); - auto offset = ctx.Attr("offset"); - auto matrows = 1; - - if (dx) { - auto *data = dx->mutable_data(ctx.GetPlace()); - - auto dx_dims = dx->dims(); - for (int i = 0; i < dx_dims.size(); i++) { - if (i != dim1 && i != dim2) { - matrows *= dx_dims[i]; - } - } - - int64_t new_dims[2], strides[2]; - std::vector matdim; - matdim.resize(matrows); - CalMatDims( - dx_dims, dim1, dim2, &offset, new_dims, strides, matdim.data()); - - auto size = dx->numel(); - framework::TensorCopy(*dout, ctx.GetPlace(), dx); - - for (int64_t i = 0; i < new_dims[0]; i += 1) { - auto sumoff = matdim[i] + offset; - for (int64_t j = 0; j < new_dims[1]; j += 1) { - auto fill_index = j * (strides[1] + strides[0]) + sumoff; - if (fill_index < size) { - data[fill_index] = 0; - } - } - } - } - } -}; - DECLARE_INPLACE_OP_INFERER(FillDiagonalTensorOpInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(FillDiagonalTensorGradOpInplaceInferer, {framework::GradVarName("Out"), @@ -272,41 +103,25 @@ DECLARE_INPLACE_OP_INFERER(FillDiagonalTensorGradOpInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(fill_diagonal_tensor, + FillDiagonalTensorInferShapeFunctor, + PD_INFER_META(phi::FillDiagonalTensorInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( + fill_diagonal_tensor_grad, + FillDiagonalTensorGradInferShapeFunctor, + PD_INFER_META(phi::FillDiagonalTensorGradInferMeta)); + REGISTER_OPERATOR( fill_diagonal_tensor, ops::FillDiagonalTensorOp, - ops::FillDiagonalTensorOpMaker, - ops::FillDiagonalTensorOpVarTypeInference, ops::FillDiagonalTensorGradOpMaker, ops::FillDiagonalTensorGradOpMaker, - ops::FillDiagonalTensorOpInplaceInferer); + ops::FillDiagonalTensorOpMaker, + ops::FillDiagonalTensorOpInplaceInferer, + ops::FillDiagonalTensorOpVarTypeInference, + FillDiagonalTensorInferShapeFunctor); REGISTER_OPERATOR(fill_diagonal_tensor_grad, ops::FillDiagonalTensorGradOp, - ops::FillDiagonalTensorGradOpInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - 
fill_diagonal_tensor, - ops::FillDiagonalTensorKernel, - ops::FillDiagonalTensorKernel, - ops::FillDiagonalTensorKernel, - ops::FillDiagonalTensorKernel, - ops::FillDiagonalTensorKernel, - ops::FillDiagonalTensorKernel, - ops::FillDiagonalTensorKernel, - ops::FillDiagonalTensorKernel>, - ops::FillDiagonalTensorKernel>, - ops::FillDiagonalTensorKernel); - -REGISTER_OP_CPU_KERNEL( - fill_diagonal_tensor_grad, - ops::FillDiagonalTensorGradKernel, - ops::FillDiagonalTensorGradKernel, - ops::FillDiagonalTensorGradKernel, - ops::FillDiagonalTensorGradKernel, - ops::FillDiagonalTensorGradKernel, - ops::FillDiagonalTensorGradKernel, - ops::FillDiagonalTensorGradKernel, - ops::FillDiagonalTensorGradKernel>, - ops::FillDiagonalTensorGradKernel>, - ops::FillDiagonalTensorGradKernel); + ops::FillDiagonalTensorGradOpInplaceInferer, + FillDiagonalTensorGradInferShapeFunctor); diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.cu b/paddle/fluid/operators/fill_diagonal_tensor_op.cu deleted file mode 100644 index 1b6ab71386b3b..0000000000000 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.cu +++ /dev/null @@ -1,231 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/fill_diagonal_tensor_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void fill_diagonal_tensor_kernel(int64_t size, - T *out_data, - const T *fill_data, - int64_t *strides, - int64_t *matdim, - int64_t offset, - int64_t fill_dims0, - int64_t fill_dims1) { - int64_t i = blockIdx.x; - auto sumoff = matdim[i] + offset; - for (int64_t j = threadIdx.x; j < fill_dims1; j += blockDim.x) { - auto fill_index = j * (strides[1] + strides[0]) + sumoff; - if (fill_index < size) { - out_data[fill_index] = fill_data[i * fill_dims1 + j]; - } - } -} - -template -__global__ void fill_grad_kernel(int64_t size, - T *out_data, - int64_t *strides, - int64_t *matdim, - int64_t offset, - int64_t fill_dims0, - int64_t fill_dims1) { - int64_t i = blockIdx.x; - auto sumoff = matdim[i] + offset; - for (int64_t j = threadIdx.x; j < fill_dims1; j += blockDim.x) { - auto fill_index = j * (strides[1] + strides[0]) + sumoff; - if (fill_index < size) { - out_data[fill_index] = T(0); - } - } -} - -template -class FillDiagonalTensorCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { -#ifdef __HIPCC__ - const int64_t kMaxBlockDim = 256; -#else - const int64_t kMaxBlockDim = 512; -#endif - auto *out = ctx.Output("Out"); - auto *srctensor = ctx.Input("Y"); - auto dim1 = ctx.Attr("dim1"); - auto dim2 = ctx.Attr("dim2"); - auto offset = ctx.Attr("offset"); - - auto *xin = ctx.Input("X"); - framework::TensorCopy(*xin, ctx.GetPlace(), out); - - T *out_data = out->mutable_data(ctx.GetPlace()); - const T *fill_data = srctensor->data(); - - auto out_dims = out->dims(); - auto matdims = srctensor->dims(); - auto fill_dims = phi::flatten_to_2d(matdims, 
matdims.size() - 1); - - int64_t new_dims[2]; - std::vector memory_block; - memory_block.resize(2 + fill_dims[0]); - int64_t *strides = &(memory_block[0]); - int64_t *matdim = &(memory_block[2]); - CalMatDims(out_dims, dim1, dim2, &offset, new_dims, strides, matdim); - PADDLE_ENFORCE_EQ( - new_dims[0], - fill_dims[0], - platform::errors::InvalidArgument("The dims should be %d x %d, but get " - "%d x %d in fill tensor Y", - new_dims[0], - new_dims[1], - fill_dims[0], - fill_dims[1])); - PADDLE_ENFORCE_EQ( - new_dims[1], - fill_dims[1], - platform::errors::InvalidArgument("The dims should be %d x %d, but get " - "%d x %d in fill tensor Y", - new_dims[0], - new_dims[1], - fill_dims[0], - fill_dims[1])); - - auto size = out->numel(); - - auto &dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); - Tensor tensor_tmp; - int64_t *memory_block_cu = - tensor_tmp.mutable_data({2 + fill_dims[0]}, ctx.GetPlace()); - const auto gpu_place = ctx.GetPlace(); - memory::Copy(gpu_place, - memory_block_cu, - platform::CPUPlace(), - memory_block.data(), - sizeof(int64_t) * (2 + fill_dims[0]), - stream); - - int64_t *strides_cu = &memory_block_cu[0], *matdim_cu = &memory_block_cu[2]; - - auto kGridDim = new_dims[0]; - auto kBlockDim = std::min(int64_t(new_dims[1]), kMaxBlockDim); - fill_diagonal_tensor_kernel - <<>>(size, - out_data, - fill_data, - strides_cu, - matdim_cu, - offset, - fill_dims[0], - fill_dims[1]); - } -}; - -template -class FillDiagonalTensorGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { -#ifdef __HIPCC__ - const int64_t kMaxBlockDim = 256; -#else - const int64_t kMaxBlockDim = 512; -#endif - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *dout = ctx.Input(framework::GradVarName("Out")); - - auto dim1 = ctx.Attr("dim1"); - auto dim2 = ctx.Attr("dim2"); - auto offset = ctx.Attr("offset"); - auto matrows = 1; - - if (dx) { - auto *data = dx->mutable_data(ctx.GetPlace()); - auto dx_dims = dx->dims(); - framework::TensorCopy(*dout, ctx.GetPlace(), dx); - - for (int i = 0; i < dx_dims.size(); i++) { - if (i != dim1 && i != dim2) { - matrows *= dx_dims[i]; - } - } - - int64_t new_dims[2]; - std::vector memory_block; - memory_block.resize(2 + matrows); - int64_t *strides = &memory_block[0]; - int64_t *matdim = &memory_block[2]; - CalMatDims(dx_dims, dim1, dim2, &offset, new_dims, strides, matdim); - - auto size = dx->numel(); - - auto &dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); - Tensor tensor_tmp; - int64_t *memory_block_cu = - tensor_tmp.mutable_data({2 + matrows}, ctx.GetPlace()); - const auto gpu_place = ctx.GetPlace(); - memory::Copy(gpu_place, - memory_block_cu, - platform::CPUPlace(), - memory_block.data(), - sizeof(int64_t) * (2 + matrows), - stream); - - int64_t *strides_cu = &memory_block_cu[0], - *matdim_cu = &memory_block_cu[2]; - - auto kGridDim = new_dims[0]; - auto kBlockDim = std::min(int64_t(new_dims[1]), kMaxBlockDim); - fill_grad_kernel<<>>( - size, data, strides_cu, matdim_cu, offset, new_dims[0], new_dims[1]); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - fill_diagonal_tensor, - ops::FillDiagonalTensorCUDAKernel, - ops::FillDiagonalTensorCUDAKernel, - ops::FillDiagonalTensorCUDAKernel, - ops::FillDiagonalTensorCUDAKernel, - ops::FillDiagonalTensorCUDAKernel, - ops::FillDiagonalTensorCUDAKernel, - 
ops::FillDiagonalTensorCUDAKernel, - ops::FillDiagonalTensorCUDAKernel>, - ops::FillDiagonalTensorCUDAKernel>, - ops::FillDiagonalTensorCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - fill_diagonal_tensor_grad, - ops::FillDiagonalTensorGradCUDAKernel, - ops::FillDiagonalTensorGradCUDAKernel, - ops::FillDiagonalTensorGradCUDAKernel, - ops::FillDiagonalTensorGradCUDAKernel, - ops::FillDiagonalTensorGradCUDAKernel, - ops::FillDiagonalTensorGradCUDAKernel, - ops::FillDiagonalTensorGradCUDAKernel, - ops::FillDiagonalTensorGradCUDAKernel>, - ops::FillDiagonalTensorGradCUDAKernel>, - ops::FillDiagonalTensorGradCUDAKernel); diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 5ec5a93ada46d..149d2bdac3c02 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -12,7 +12,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/fold_op.h" +#include +#include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,216 +26,6 @@ namespace operators { class FoldOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), - true, - platform::errors::NotFound("Input(X) of FoldOp should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Y"), - true, - platform::errors::NotFound("Output(Y) of FoldOp should not be null")); - auto in_dims = ctx->GetInputDim("X"); - std::vector output_sizes = - ctx->Attrs().Get>("output_sizes"); - std::vector kernel_sizes = - ctx->Attrs().Get>("kernel_sizes"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - std::vector dilations = - ctx->Attrs().Get>("dilations"); - - PADDLE_ENFORCE_EQ( - output_sizes.size(), - 2, - platform::errors::InvalidArgument( - "It is expected output_size equals to 2, but got size %d", - output_sizes.size())); - PADDLE_ENFORCE_EQ( - kernel_sizes.size(), - 2, - platform::errors::InvalidArgument( - "It is expected kernel_size equals to 2, but got size %d", - kernel_sizes.size())); - PADDLE_ENFORCE_EQ( - strides.size(), - 2, - platform::errors::InvalidArgument( - "It is expected strides_size equals to 2, but got size %d", - strides.size())); - PADDLE_ENFORCE_EQ( - paddings.size(), - 4, - platform::errors::InvalidArgument( - "It is expected paddings_size equals to 4, but got size %d", - paddings.size())); - PADDLE_ENFORCE_EQ( - dilations.size(), - 2, - platform::errors::InvalidArgument( - "It is expected dilations_size equals to 2, but got size %d", - dilations.size())); - - int output_height = output_sizes[0]; - int output_width = output_sizes[1]; - int kernel_height = kernel_sizes[0]; - int kernel_width = kernel_sizes[1]; - int dilation_height = dilations[0]; - int dilation_width = dilations[1]; - int stride_height = strides[0]; - int stride_width = strides[1]; - - // check kernel_sizes - PADDLE_ENFORCE_GT(kernel_height, - 0, - platform::errors::InvalidArgument( - "The `kernel_sizes` should be greater than zero, " - "but received kernel_height: %d kernel_width: %d.", - kernel_sizes[0], - kernel_sizes[1])); - PADDLE_ENFORCE_GT(kernel_width, - 0, - platform::errors::InvalidArgument( - "The 
`kernel_sizes` should be greater than zero, " - "but received kernel_height: %d kernel_width: %d.", - kernel_sizes[0], - kernel_sizes[1])); - // check strides - PADDLE_ENFORCE_GT(stride_height, - 0, - platform::errors::InvalidArgument( - "The `strides` should be greater than zero, " - "but received strides_height: %d strides_width: %d.", - strides[0], - strides[1])); - PADDLE_ENFORCE_GT(stride_width, - 0, - platform::errors::InvalidArgument( - "The `strides` should be greater than zero, " - "but received strides_height: %d strides_width: %d.", - strides[0], - strides[1])); - // check dilations - PADDLE_ENFORCE_GT(output_height, - 1, - platform::errors::InvalidArgument( - "The `output_height` should be greater than one, " - "but received output_height: %d .", - output_height)); - PADDLE_ENFORCE_GT(output_width, - 1, - platform::errors::InvalidArgument( - "The `output_width` should be greater than one, " - "but received output_width: %d .", - output_width)); - // check output size - PADDLE_ENFORCE_GT( - dilation_height, - 0, - platform::errors::InvalidArgument( - "The `dilations` should be greater than zero, " - "but received dilations_height: %d dilations_width: %d.", - dilations[0], - dilations[1])); - PADDLE_ENFORCE_GT( - dilation_width, - 0, - platform::errors::InvalidArgument( - "The `dilations` should be greater than zero, " - "but received dilations_height: %d dilations_width: %d.", - dilations[0], - dilations[1])); - - std::vector out_dims; - // batch_size - out_dims.push_back(in_dims[0]); - // output_plane - int output_channels = in_dims[1] / (kernel_width * kernel_height); - out_dims.push_back(output_channels); - - int blocks_height = (output_sizes[0] + 2 * paddings[0] - - (dilations[0] * (kernel_sizes[0] - 1) + 1)) / - strides[0] + - 1; - int blocks_width = (output_sizes[1] + 2 * paddings[1] - - (dilations[1] * (kernel_sizes[1] - 1) + 1)) / - strides[1] + - 1; - - // check output height and width - PADDLE_ENFORCE_GT( - blocks_height, - 0, - platform::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size (%d, %d), " - "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " - "is (%d, %d), which should be a positive integer.", - in_dims[2], - in_dims[3], - kernel_sizes[0], - kernel_sizes[1], - strides[0], - strides[1], - dilations[0], - dilations[1], - output_height, - output_width)); - - PADDLE_ENFORCE_GT( - blocks_width, - 0, - platform::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size (%d, %d), " - "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " - "is (%d, %d), which should be a positive integer.", - in_dims[2], - in_dims[3], - kernel_sizes[0], - kernel_sizes[1], - strides[0], - strides[1], - dilations[0], - dilations[1], - output_height, - output_width)); - - PADDLE_ENFORCE_EQ( - blocks_height * blocks_width, - in_dims[2], - platform::errors::InvalidArgument( - "Given input output_size (%d, %d), " - "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " - "which should be expected size of input's dimension " - "2 to match the calculated number of %d * %d = %d, but got %d", - output_height, - output_width, - kernel_sizes[0], - kernel_sizes[1], - strides[0], - strides[1], - dilations[0], - dilations[1], - blocks_height, - blocks_width, - blocks_height * blocks_width, - in_dims[2])); - - PADDLE_ENFORCE_EQ( - in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), - 0, - platform::errors::InvalidArgument( - "Expected size of input's dimension 1 to be divisible by the" - "product of kernel_size, 
but got input.size(1)=%d and " - "kernel_size=( %d" - ", %d).", - in_dims[1], - kernel_sizes[0], - kernel_sizes[1])); - - out_dims.push_back(output_height); - out_dims.push_back(output_width); - ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -286,22 +82,6 @@ class FoldGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Y")), - true, - platform::errors::NotFound("The gradient of Y should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), - true, - platform::errors::NotFound("The input X should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput(framework::GradVarName("X")), - true, - platform::errors::NotFound("The gradient of X should not be null")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -332,18 +112,19 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(FoldGradOpNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(fold, + FoldInferShapeFunctor, + PD_INFER_META(phi::FoldInferMeta)); REGISTER_OPERATOR(fold, ops::FoldOp, ops::FoldOpMaker, ops::FoldGradMaker, - ops::FoldGradMaker); + ops::FoldGradMaker, + FoldInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(fold_grad, + FoldGradInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(fold_grad, ops::FoldGradOp, - ops::FoldGradOpNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL(fold, - ops::FoldOpKernel, - ops::FoldOpKernel); -REGISTER_OP_CPU_KERNEL(fold_grad, - ops::FoldGradOpKernel, - ops::FoldGradOpKernel); + ops::FoldGradOpNoNeedBufferVarsInferer, + FoldGradInferShapeFunctor); diff --git a/paddle/fluid/operators/fold_op.cu b/paddle/fluid/operators/fold_op.cu deleted file mode 100644 index 7728d57a276af..0000000000000 --- a/paddle/fluid/operators/fold_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/fold_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(fold, - ops::FoldOpKernel, - ops::FoldOpKernel); - -REGISTER_OP_CUDA_KERNEL(fold_grad, - ops::FoldGradOpKernel, - ops::FoldGradOpKernel); diff --git a/paddle/fluid/operators/fold_op.h b/paddle/fluid/operators/fold_op.h deleted file mode 100644 index 704e4de1a6942..0000000000000 --- a/paddle/fluid/operators/fold_op.h +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class FoldOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("X"); - const int batch_size = static_cast(input->dims()[0]); - Tensor* output = ctx.Output("Y"); - output->mutable_data(ctx.GetPlace()); - - std::vector output_sizes = ctx.Attr>("output_sizes"); - std::vector kernel_sizes = ctx.Attr>("kernel_sizes"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - - math::Col2ImFunctor col2im; - - auto& dev_ctx = ctx.template device_context(); - - auto input_dims = input->dims(); - - int output_height = (output_sizes[0] + 2 * paddings[0] - - (dilations[0] * (kernel_sizes[0] - 1) + 1)) / - strides[0] + - 1; - int output_width = (output_sizes[1] + 2 * paddings[1] - - (dilations[1] * (kernel_sizes[1] - 1) + 1)) / - strides[1] + - 1; - - int n_input_plane = input_dims[1]; - int n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); - - framework::DDim output_shape( - {n_output_plane, output_sizes[0], output_sizes[1]}); - - framework::DDim input_matrix_shape({input_dims[0], - kernel_sizes[0], - kernel_sizes[1], - output_height, - output_width}); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, output, static_cast(0)); - - for (int i = 0; i < batch_size; i++) { - Tensor out_batch = - output->Slice(i, i + 1).Resize(output_shape); // im size=3 - Tensor in_batch = - input->Slice(i, i + 1).Resize(input_matrix_shape); // col size=5 - col2im(dev_ctx, in_batch, dilations, strides, paddings, &out_batch); - } - } -}; - -template -class FoldGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* output_grad = ctx.Input(framework::GradVarName("Y")); - Tensor* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - - if ((!output_grad) || (!input_grad)) return; - - std::vector output_sizes = ctx.Attr>("output_sizes"); - std::vector kernel_sizes = ctx.Attr>("kernel_sizes"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input_grad->dims()[0]); - - auto input_dims = input_grad->dims(); - - int output_height = (output_sizes[0] + 2 * paddings[0] - - (dilations[0] * (kernel_sizes[0] - 1) + 1)) / - strides[0] + - 1; - int output_width = (output_sizes[1] + 2 * paddings[1] - - (dilations[1] * (kernel_sizes[1] - 1) + 1)) / - strides[1] + - 1; - - int n_input_plane = input_dims[1]; - int n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); - - framework::DDim output_shape( - {n_output_plane, output_sizes[0], output_sizes[1]}); - framework::DDim 
input_matrix_shape({input_dims[0], - kernel_sizes[0], - kernel_sizes[1], - output_height, - output_width}); - - math::Im2ColFunctor im2col; - auto& dev_ctx = ctx.template device_context(); - - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = output_grad->Slice(i, i + 1).Resize(output_shape); - Tensor in_grad_batch = - input_grad->Slice(i, i + 1).Resize(input_matrix_shape); - im2col(dev_ctx, - out_grad_batch, - dilations, - strides, - paddings, - &in_grad_batch); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index a858b31e23c8a..6414954667bfe 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -1279,9 +1279,12 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); Tensor bias_dropout_residual_out, dropout_mask_out; - auto *bias_dropout_residual_out_data = - bias_dropout_residual_out.mutable_data({bsz, seq_len, dim_embed}, - place); + T *bias_dropout_residual_out_data = nullptr; + if (pre_layer_norm) { + bias_dropout_residual_out_data = + bias_dropout_residual_out.mutable_data({bsz, seq_len, dim_embed}, + place); + } auto *dropout_mask_out_data = dropout_mask_out.mutable_data( {bsz, seq_len, dim_embed}, place); @@ -1333,14 +1336,19 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // step1: buf1 --> buf0 // step2: buf0 --> buf1 int layers = qkv_weights.size(); - if (layers & 1) { - // odd, set buf1 as out + if (pre_layer_norm) { + if (layers & 1) { + // odd, set buf1 as out + buf0 = &tmp_out; + buf1 = out; + } else { + // even, set buf0 as out + buf0 = out; + buf1 = &tmp_out; + } + } else { buf0 = &tmp_out; buf1 = out; - } else { - // even, set buf0 as out - buf0 = out; - buf1 = &tmp_out; } for (int i = 0; i < layers; ++i) { @@ -1355,9 +1363,6 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { buf1->data(), ln_mean_data, ln_var_data); - } else if (!pre_layer_norm) { - PADDLE_THROW(platform::errors::Unimplemented( - "Unimplemented post_layer_norm for now.")); } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "step1"; @@ -1367,8 +1372,13 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { const Tensor *qkv_bias = qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; // NOTE: in decoder stage, bias is fused in fmha const Tensor *bias = time_step ? nullptr : qkv_bias; - qkv_compute.ComputeForward( - qkv_weights[i], buf1, bias, &qkv_out, &qkv_out); + if (!pre_layer_norm && i == 0) { + qkv_compute.ComputeForward( + qkv_weights[i], input_x, bias, &qkv_out, &qkv_out); + } else { + qkv_compute.ComputeForward( + qkv_weights[i], buf1, bias, &qkv_out, &qkv_out); + } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "step2"; #endif @@ -1451,10 +1461,15 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { VLOG(0) << "step3"; #endif - // step4. 
out_linear - out_linear_compute.ComputeForward( - out_linear_weights[i], &fmha_out, nullptr, buf1, nullptr); - AllReduce(*buf1, ring_id, dev_ctx); + if (pre_layer_norm) { + out_linear_compute.ComputeForward( + out_linear_weights[i], &fmha_out, nullptr, buf1, nullptr); + AllReduce(*buf1, ring_id, dev_ctx); + } else { + out_linear_compute.ComputeForward( + out_linear_weights[i], &fmha_out, nullptr, buf0, nullptr); + AllReduce(*buf0, ring_id, dev_ctx); + } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "step4"; #endif @@ -1479,6 +1494,22 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { ln_mean_data, ln_var_data); } else { + auto *ln_scale_data = ln_scales[i]->data(); + auto *ln_bias_data = ln_biases[i]->data(); + auto *out_linear_bias_data = out_linear_biases[i]->data(); + auto *residual_data = (i == 0 ? x_data : buf1->data()); + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + dev_ctx, + buf0->data(), + residual_data, + out_linear_bias_data, + ln_scale_data, + ln_bias_data, + buf0->data(), + dropout_mask_out_data, + buf1->data(), + ln_mean_data, + ln_var_data); } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "step5"; @@ -1504,13 +1535,22 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { #endif // step8. ffn matmul2 - ffn2_linear_compute.ComputeForward( - ffn2_weights[i], &ffn1_dropout_out, nullptr, buf1, nullptr); + if (pre_layer_norm) { + ffn2_linear_compute.ComputeForward( + ffn2_weights[i], &ffn1_dropout_out, nullptr, buf1, nullptr); + } else { + ffn2_linear_compute.ComputeForward( + ffn2_weights[i], &ffn1_dropout_out, nullptr, buf0, nullptr); + } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "step8.0"; #endif - AllReduce(*buf1, ring_id, dev_ctx); + if (pre_layer_norm) { + AllReduce(*buf1, ring_id, dev_ctx); + } else { + AllReduce(*buf0, ring_id, dev_ctx); + } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "step8.1"; #endif @@ -1543,12 +1583,28 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { dropout_mask_out_data); } } else { + auto *ln_scale_data = ffn_ln_scales[i]->data(); + auto *ln_bias_data = ffn_ln_biases[i]->data(); + ffn2_fused_dropout_helper.LayernormResidualDropoutBias( + dev_ctx, + buf0->data(), + buf1->data(), + ffn2_biases[i]->data(), + ln_scale_data, + ln_bias_data, + buf0->data(), + dropout_mask_out_data, + buf1->data(), + ln_mean_data, + ln_var_data); } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "step9"; #endif - x_data = buf1->data(); - std::swap(buf0, buf1); + if (pre_layer_norm) { + x_data = buf1->data(); + std::swap(buf0, buf1); + } } } }; diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h index 56806a8c17340..009a9253ab351 100644 --- a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h +++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h @@ -173,7 +173,6 @@ void LaunchFusedSoftmaxMaskKernel(const T* src, dim3 block(warp_size, warps_per_block); dim3 grid(DIV_UP(seq_len, warps_per_block), batch_size, head_num); - // clang-format off int elements = ElementsCeil(seq_len); switch (elements) { case 1: { // <=32 @@ -193,17 +192,16 @@ void LaunchFusedSoftmaxMaskKernel(const T* src, SELECT_SOFTMAX_MASK_KERNEL(4); break; } - CASE_SOFTMAX_MASK_KERNEL(8); // <=256 - CASE_SOFTMAX_MASK_KERNEL(16); // <=512 - CASE_SOFTMAX_MASK_KERNEL(32); // <=1024 - CASE_SOFTMAX_MASK_KERNEL(64); // <=2048 - CASE_SOFTMAX_MASK_KERNEL(128); // <=4096 + CASE_SOFTMAX_MASK_KERNEL(8); // <=256 + CASE_SOFTMAX_MASK_KERNEL(16); // <=512 + 
CASE_SOFTMAX_MASK_KERNEL(32); // <=1024 + CASE_SOFTMAX_MASK_KERNEL(64); // <=2048 + CASE_SOFTMAX_MASK_KERNEL(128); // <=4096 default: PADDLE_THROW(platform::errors::InvalidArgument( "seq_len must be between (0, 4096], received the seq_len is %d", seq_len)); } - // clang-format on } } // namespace operators diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index f2d010e16a2ea..f7ffd63d6d2d3 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -256,6 +256,19 @@ __global__ void broadcast(const T *src, } } +template +__global__ void broadcast_batch(const T *src, + T *dst, + const int seq_len, + const int head_num, + const int window_num) { + int WindownumHeadSeqlen_id = blockIdx.x % (window_num * head_num * seq_len); + int dst_offset = blockIdx.x * seq_len; + if (threadIdx.x < seq_len) { + dst[threadIdx.x + dst_offset] = src[threadIdx.x + WindownumHeadSeqlen_id * seq_len]; + } +} + template class MultiHeadMatMulV2Kernel : public framework::OpKernel { public: @@ -274,7 +287,9 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { auto *bias_d = bias->data(); auto *bias_qk_d = bias_qk.template data(); T scale = static_cast(context.Attr("alpha")); - + + auto bias_qk_dims = bias_qk.dims(); + int window_num = bias_qk_dims[0]; int head_number = context.Attr("head_number"); // compute q*k with eltadd auto &device_ctx = context.template device_context(); @@ -286,6 +301,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { int batch = input_dims[0]; int seq_len = input_dims[1]; int hidden = input_dims[2]; + Tensor temp_bias_tensor; // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted if (bias_qk.numel() == (batch * seq_len)) { @@ -297,6 +313,19 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { bias_qk_d, temp_qk_bias, seq_len, head_number); bias_qk_d = static_cast(temp_qk_bias); } + // if bias_qk is [window_num, head_number, seq_len, seq_len], as in the swin + // SW-MSA block, dim[0] of the input is batch_number * window_num; therefore + // broadcast bias_qk to [window_num * original_batch, head_number, seq_len, seq_len] + if (bias_qk.numel() == (window_num * head_number * seq_len * seq_len)) { + temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len}); + auto *temp_qk_bias = temp_bias_tensor.mutable_data(context.GetPlace()); + int grid = batch * head_number * seq_len; + int block = round_up(seq_len); + broadcast_batch<<>>( + bias_qk_d, temp_qk_bias, seq_len, head_number, window_num); + bias_qk_d = static_cast(temp_qk_bias); + } + int all_head_size = w_dims[2]; int head_size = all_head_size / head_number; diff --git a/paddle/fluid/operators/gather_scatter_kernel.h b/paddle/fluid/operators/gather_scatter_kernel.h index c8a63e4c35a3e..6aa6e4ff7b858 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.h +++ b/paddle/fluid/operators/gather_scatter_kernel.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/float16.h" #pragma once diff --git a/paddle/fluid/operators/graph_sample_neighbors_op.cc b/paddle/fluid/operators/graph_sample_neighbors_op.cc index b44a2823732d8..14f17f77dcb6f 100644 --- a/paddle/fluid/operators/graph_sample_neighbors_op.cc +++ b/paddle/fluid/operators/graph_sample_neighbors_op.cc @@ -50,7 +50,7 @@ class GraphSampleNeighborsOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out_Eids", "The eids of the sample edges"); AddAttr( "sample_size", - "The sample size of graph sample neighbors method. ", + "The sample size of graph sample neighbors method. " "Set default value as -1, means return all neighbors of nodes.") .SetDefault(-1); AddAttr("return_eids", diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index d9c0ec5171464..e9ba861c3b88b 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -58,6 +58,10 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker { "The input tensor with data type float32, float64, int32, int64."); AddInput("Src_index", "The source index tensor."); AddInput("Dst_index", "The destination index tensor."); + AddInput("Out_size", + "(Tensor, optional). The 0th dimension of the output." + "It has a higher priority than Attr(out_size).") + .AsDispensable(); AddOutput("Out", "Output tensor of graph_send_recv op."); AddOutput("Dst_count", "Count tensor of Dst_index, mainly for MEAN pool_type.") @@ -68,12 +72,12 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker { "tensors of Dst_index.") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); - AddAttr( + AddAttr>( "out_size", - "(int64_t, default 0)" + "(vector, default {0})" "Define the first dimension of Output tensor." - "If set default 0, then the shape of Out is the same with X.") - .SetDefault(0); + "If set default {0}, then the shape of Out is the same with X.") + .SetDefault({0}); AddComment(R"DOC( Graph Learning Send_Recv combine operator. diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cc b/paddle/fluid/operators/margin_cross_entropy_op.cc index 6ae692260a554..9e9ee9c561159 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cc +++ b/paddle/fluid/operators/margin_cross_entropy_op.cc @@ -12,7 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/margin_cross_entropy_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,55 +25,6 @@ class MarginCrossEntropyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Logits"), "Input", "Logits", "MarginCrossEntropyOp"); - OP_INOUT_CHECK( - ctx->HasInput("Label"), "Input", "Label", "MarginCrossEntropyOp"); - - OP_INOUT_CHECK( - ctx->HasOutput("Softmax"), "Output", "Softmax", "MarginCrossEntropyOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Loss"), "Output", "Loss", "MarginCrossEntropyOp"); - - auto logits_dims = ctx->GetInputDim("Logits"); - auto labels_dims = ctx->GetInputDim("Label"); - - auto logits_rank = logits_dims.size(); - auto axis = logits_rank - 1; - for (int i = 0; i < logits_rank; i++) { - if (i != axis) { - if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { - PADDLE_ENFORCE_EQ(logits_dims[i], - labels_dims[i], - platform::errors::InvalidArgument( - "Input(Logits) and Input(Label) should in " - "same shape in dimensions except axis.")); - } - } - } - - if (labels_dims.size() > 1) { - PADDLE_ENFORCE_EQ( - labels_dims[logits_rank - 1], - 1UL, - platform::errors::InvalidArgument( - "the last dimension of Input(Label) should be 1." - "But received: the last dimension of Input(Label) is [%d]," - "the last dimension is [%d]", - labels_dims[logits_rank - 1], - logits_rank - 1)); - } - - ctx->SetOutputDim("Softmax", logits_dims); - - logits_dims[axis] = 1; - ctx->SetOutputDim("Loss", logits_dims); - - ctx->ShareLoD("Logits", /*->*/ "Softmax"); - ctx->ShareLoD("Logits", /*->*/ "Loss"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -140,29 +95,6 @@ class MarginCrossEntropyOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), - true, - platform::errors::InvalidArgument( - "Input(Loss@Grad) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), - true, - platform::errors::InvalidArgument( - "Input(Softmax) should be not null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Label"), - true, - platform::errors::InvalidArgument("Input(Label) should be not null.")); - - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), - true, - platform::errors::InvalidArgument( - "Output(Logits@Grad) should be not null.")); - - ctx->SetOutputDim(framework::GradVarName("Logits"), - ctx->GetInputDim("Softmax")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -194,18 +126,21 @@ class MarginCrossEntropyOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(margin_cross_entropy, + MarginCrossEntropyInferShapeFunctor, + PD_INFER_META(phi::MarginCrossEntropyInferMeta)); REGISTER_OPERATOR( margin_cross_entropy, ops::MarginCrossEntropyOp, ops::MarginCrossEntropyOpMaker, 
ops::MarginCrossEntropyOpGradMaker, - ops::MarginCrossEntropyOpGradMaker); - -REGISTER_OPERATOR(margin_cross_entropy_grad, ops::MarginCrossEntropyOpGrad); - -REGISTER_OP_CPU_KERNEL(margin_cross_entropy, - ops::MarginCrossEntropyOpCPUKernel, - ops::MarginCrossEntropyOpCPUKernel, - ops::MarginCrossEntropyOpCPUKernel); + ops::MarginCrossEntropyOpGradMaker, + MarginCrossEntropyInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR( + margin_cross_entropy_grad, + MarginCrossEntropyGradInferShapeFunctor, + PD_INFER_META(phi::MarginCrossEntropyGradInferMeta)); +REGISTER_OPERATOR(margin_cross_entropy_grad, + ops::MarginCrossEntropyOpGrad, + MarginCrossEntropyGradInferShapeFunctor); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu deleted file mode 100644 index 6d1ff9f296eb8..0000000000000 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ /dev/null @@ -1,618 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_HIP -#include -namespace cub = hipcub; -#else -#include -#endif - -#include - -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/margin_cross_entropy_op.h" -#include "paddle/fluid/operators/math/softmax_impl.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroup.h" -#include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -void GetClassInterval(const gpuStream_t& stream, - const platform::Place& place, - const platform::DeviceContext& ctx, - const int rid, - const int rank, - const int nranks, - const int D, - Tensor* class_interval) { - std::vector shard_dim_vec(nranks + 1, 0); - shard_dim_vec[rank + 1] = D; - if (nranks <= 1) { - framework::TensorFromVector(shard_dim_vec, ctx, class_interval); - return; - } - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - Tensor num_classes_per_device; - framework::TensorFromVector(shard_dim_vec, ctx, &num_classes_per_device); - int* num_classes_per_device_ptr = num_classes_per_device.data(); - - auto map = distributed::ProcessGroupMapFromGid::getInstance(); - if (map->has(rid)) { - // Use ProcessGroup - distributed::ProcessGroup* pg = map->get(rid); - std::vector in_tensor; - std::vector out_tensor; - 
in_tensor.push_back(num_classes_per_device); - out_tensor.push_back(num_classes_per_device); - - distributed::AllreduceOptions opts; - opts.reduce_op = distributed::ReduceOp::SUM; - auto task = pg->AllReduce(in_tensor, out_tensor, opts); - task->Wait(); - } else { - const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); - // use global calculate stream - const auto calcu_stream = - static_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - num_classes_per_device_ptr, - num_classes_per_device_ptr, - num_classes_per_device.numel(), - platform::ToNCCLDataType( - framework::TransToProtoVarType(num_classes_per_device.dtype())), - ncclSum, - comm->comm(), - calcu_stream)); - } - - auto class_interval_ptr = - class_interval->mutable_data({nranks + 1}, place); - size_t cub_temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum( - nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream); - auto cub_temp_storage = memory::Alloc(place, cub_temp_storage_bytes); - cub::DeviceScan::InclusiveSum(cub_temp_storage->ptr(), - cub_temp_storage_bytes, - num_classes_per_device_ptr, - class_interval_ptr, - nranks + 1, - stream); - return; -#endif -} - -template -__global__ void AddMarginToPositiveLogitsKernel(T* logit, - const IndexT* label, - const float margin1, - const float margin2, - const float margin3, - const int rank, - const int nranks, - const int64_t N, - const int64_t D, - const int* class_interval_ptr) { - using MPType = typename details::MPTypeTrait::Type; - int start_index = class_interval_ptr[rank]; - int end_index = class_interval_ptr[rank + 1]; - int num_classes = class_interval_ptr[nranks]; - CUDA_KERNEL_LOOP(i, N) { - auto real_label = label[i]; - PADDLE_ENFORCE((real_label < num_classes) && (real_label >= 0), - "The index is out of bounds, " - "please check whether the value of label and " - "input meet the number of class. 
It should " - "be less than [%d], but received [%d]", - num_classes, - real_label); - - if (real_label >= start_index && real_label < end_index) { - int64_t offset = i * D + real_label - start_index; - if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) { - MPType x = static_cast(logit[offset]); - MPType theta = acos(x); - if (fabs(margin1 - 1.0) > 1e-8) { - theta *= static_cast(margin1); - } - if (fabs(margin2) > 1e-8) { - theta += static_cast(margin2); - } - logit[offset] = static_cast(cos(theta)); - } - if (fabs(margin3) > 1e-8) { - MPType y = static_cast(logit[offset]); - y -= static_cast(margin3); - logit[offset] = static_cast(y); - } - } - } -} - -template -__global__ void ScaleLogitKernel(T* logits, - const float scale, - const int64_t N, - const int64_t D) { - CUDA_KERNEL_LOOP(i, N * D) { logits[i] *= static_cast(scale); } -} - -template -__global__ void LogitsMinusMaxKernel(T* logits, - const T* logits_max_per_row, - const int64_t N, - const int64_t D) { - CUDA_KERNEL_LOOP(i, N * D) { - auto row = i / D; - logits[i] -= logits_max_per_row[row]; - } -} - -template -__global__ void LogitsMinusLogSumKernel(T* logits, - const T* logits_sum_per_row, - const int64_t N, - const int64_t D) { - CUDA_KERNEL_LOOP(i, N * D) { - auto row = i / D; - logits[i] -= kps::details::Log(logits_sum_per_row[row]); - } -} - -template -__global__ void HardLabelSoftmaxWithCrossEntropyKernel( - T* loss, - T* log_softmax, - const IndexT* labels, - const int rank, - const int64_t N, - const int64_t D, - const int* class_interval_ptr) { - int start_index = class_interval_ptr[rank]; - CUDA_KERNEL_LOOP(i, N * D) { - auto row = i / D; - auto col = i % D; - if ((col + start_index) == labels[row]) { - auto softmax = log_softmax[i]; - loss[row] = -softmax; - log_softmax[i] = kps::details::Exp(softmax); - } else { - log_softmax[i] = kps::details::Exp(log_softmax[i]); - } - } -} - -template -__global__ void CalculateGrad(T* logits_grad, - const T* loss_grad, - const T* logits, - const IndexT* labels, - const float margin1, - const float margin2, - const float scale, - const int rank, - const int64_t N, - const int64_t D, - const int* class_interval_ptr) { - using MPType = typename details::MPTypeTrait::Type; - int start_index = class_interval_ptr[rank]; - CUDA_KERNEL_LOOP(i, N * D) { - auto row = i / D; - auto col = i % D; - if ((col + start_index) == labels[row]) { - logits_grad[i] = (logits_grad[i] - static_cast(1.0)) * loss_grad[row]; - if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) { - MPType dout = static_cast(logits_grad[i]); - MPType one = static_cast(1.0f); - MPType x = static_cast(logits[i]); - MPType m1 = static_cast(margin1); - MPType m2 = static_cast(margin2); - - MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x); - logits_grad[i] = static_cast(dout * d); - } - } else { - logits_grad[i] *= loss_grad[row]; - } - if (fabs(scale - 1.0) > 1e-8) { - logits_grad[i] *= static_cast(scale); - } - } -} - -template -class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* logits = ctx.Input("Logits"); - const Tensor* labels = ctx.Input("Label"); - Tensor* softmax = ctx.Output("Softmax"); - Tensor* loss = ctx.Output("Loss"); - - const int rid = ctx.Attr("ring_id"); - const int nranks = ctx.Attr("nranks"); - const int rank = ctx.Attr("rank"); - - const float margin1 = ctx.Attr("margin1"); - const float margin2 = ctx.Attr("margin2"); - const float margin3 = ctx.Attr("margin3"); - const float 
scale = ctx.Attr("scale"); - - const auto& place = ctx.GetPlace(); - auto& dev_ctx = ctx.template device_context(); - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - platform::NCCLComm* comm; - distributed::ProcessGroup* pg = nullptr; - gpuStream_t stream; - if (nranks > 1) { - auto map = distributed::ProcessGroupMapFromGid::getInstance(); - if (map->has(rid)) { - // Use ProcessGroup - pg = map->get(rid); - } else { - comm = platform::NCCLCommContext::Instance().Get(rid, place); - - // use global calculate stream - stream = static_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - } - } -#endif - - // allocate memory on device. - T* softmax_ptr = softmax->mutable_data(place); - T* loss_ptr = loss->mutable_data(place); - - const auto& logits_dims = logits->dims(); - const auto& labels_dims = labels->dims(); - - const int axis = logits_dims.size() - 1; - const int N = phi::funcs::SizeToAxis(axis, logits_dims); - const int D = phi::funcs::SizeFromAxis(axis, logits_dims); - - int blocks = NumBlocks(N); - int threads = kNumCUDAThreads; - const auto& label_type = framework::TransToProtoVarType(labels->dtype()); - - // copy logits to softmax variable since we can't modify logits, - // and it also be used when calculate grad - framework::TensorCopy( - *logits, ctx.GetPlace(), ctx.device_context(), softmax); - - Tensor softmax_2d; - softmax_2d.ShareDataWith(*softmax).Resize({N, D}); - T* logits_ptr = softmax_2d.data(); - - Tensor class_interval; - GetClassInterval(dev_ctx.stream(), - place, - ctx.cuda_device_context(), - rid, - rank, - nranks, - D, - &class_interval); - - // step 1, preprocess logits - // add margin for positive elements - // theta = acos(x_i) - // (cos(m1 * theta + m2) - m3) - // save match_logits, used for gradient computation. 
- if (label_type == framework::proto::VarType::INT32) { - typedef int32_t LabelT; - AddMarginToPositiveLogitsKernel - <<>>( - logits_ptr, - labels->data(), - margin1, - margin2, - margin3, - rank, - nranks, - N, - D, - class_interval.data()); - } else if (label_type == framework::proto::VarType::INT64) { - typedef int64_t LabelT; - AddMarginToPositiveLogitsKernel - <<>>( - logits_ptr, - labels->data(), - margin1, - margin2, - margin3, - rank, - nranks, - N, - D, - class_interval.data()); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "margin_cross_entropy label type noly support int32 and int64, " - "but got %s", - label_type)); - } - - // scale by s - ScaleLogitKernel<<>>( - logits_ptr, scale, N, D); - - // step 2, obtain logit_max - Tensor logits_max; - logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); - T* logits_max_buff = logits_max.mutable_data(place); - TensorReduceImpl>( - dev_ctx, - softmax_2d, - &logits_max, - kps::IdentityFunctor(), - {1}, - dev_ctx.stream()); - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (nranks > 1) { - if (pg) { - std::vector in_tensor; - std::vector out_tensor; - in_tensor.push_back(logits_max); - out_tensor.push_back(logits_max); - - distributed::AllreduceOptions opts; - opts.reduce_op = distributed::ReduceOp::MAX; - auto task = pg->AllReduce(in_tensor, out_tensor, opts); - task->Wait(); - } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - logits_max_buff, - logits_max_buff, - logits_max.numel(), - platform::ToNCCLDataType( - framework::TransToProtoVarType(logits_max.dtype())), - ncclMax, - comm->comm(), - stream)); - } - } -#endif - - // step 3, logit - logit_max - LogitsMinusMaxKernel<<>>( - logits_ptr, logits_max_buff, N, D); - - // step 4, sum(exp(logit - logit_max)) - Tensor sum_exp_logits; - sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); - T* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); - TensorReduceImpl>( - dev_ctx, - softmax_2d, - &sum_exp_logits, - kps::ExpFunctor(), - {1}, - dev_ctx.stream()); - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (nranks > 1) { - if (pg) { - std::vector in_tensor; - std::vector out_tensor; - in_tensor.push_back(sum_exp_logits); - out_tensor.push_back(sum_exp_logits); - - distributed::AllreduceOptions opts; - opts.reduce_op = distributed::ReduceOp::SUM; - auto task = pg->AllReduce(in_tensor, out_tensor, opts); - task->Wait(); - } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sum_exp_logits_buff, - sum_exp_logits_buff, - sum_exp_logits.numel(), - platform::ToNCCLDataType( - framework::TransToProtoVarType(sum_exp_logits.dtype())), - ncclSum, - comm->comm(), - stream)); - } - } -#endif - - // step 5, (logit - logit_max) - log(sum(exp(logit - logit_max))) - LogitsMinusLogSumKernel - <<>>( - logits_ptr, sum_exp_logits_buff, N, D); - - // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - - // logit_max)))) - // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max)))) - phi::funcs::SetConstant()( - dev_ctx, loss, static_cast(0.0)); - if (label_type == framework::proto::VarType::INT32) { - typedef int32_t LabelT; - HardLabelSoftmaxWithCrossEntropyKernel - <<>>( - loss_ptr, - logits_ptr, - labels->data(), - rank, - N, - D, - class_interval.data()); - } else if (label_type == framework::proto::VarType::INT64) { - typedef int64_t LabelT; - HardLabelSoftmaxWithCrossEntropyKernel - <<>>( - loss_ptr, - logits_ptr, - labels->data(), - rank, - N, - D, - class_interval.data()); - } - -#if 
defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (nranks > 1) { - if (pg) { - std::vector in_tensor; - std::vector out_tensor; - in_tensor.push_back(*loss); - out_tensor.push_back(*loss); - - distributed::AllreduceOptions opts; - opts.reduce_op = distributed::ReduceOp::SUM; - auto task = pg->AllReduce(in_tensor, out_tensor, opts); - task->Wait(); - } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - loss_ptr, - loss_ptr, - loss->numel(), - platform::ToNCCLDataType( - framework::TransToProtoVarType(loss->dtype())), - ncclSum, - comm->comm(), - stream)); - } - } -#endif - } -}; - -template -class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* labels = context.Input("Label"); - const Tensor* logits = context.Input("Logits"); - const Tensor* softmax = context.Input("Softmax"); - - const Tensor* loss_grad = - context.Input(framework::GradVarName("Loss")); - Tensor* logit_grad = - context.Output(framework::GradVarName("Logits")); - - const bool return_softmax = context.Attr("return_softmax"); - - const int rid = context.Attr("ring_id"); - const int nranks = context.Attr("nranks"); - const int rank = context.Attr("rank"); - - const float margin1 = context.Attr("margin1"); - const float margin2 = context.Attr("margin2"); - const float margin3 = context.Attr("margin3"); - const float scale = context.Attr("scale"); - - auto& dev_ctx = context.template device_context(); - - const auto sofrmax_dims = softmax->dims(); - const int axis = sofrmax_dims.size() - 1; - const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims); - const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims); - - if (return_softmax) { - framework::TensorCopy( - *softmax, context.GetPlace(), context.device_context(), logit_grad); - } else { - logit_grad->ShareDataWith(*softmax); - } - - int blocks = NumBlocks(N * D); - int threads = kNumCUDAThreads; - const auto& label_type = framework::TransToProtoVarType(labels->dtype()); - - Tensor class_interval; - GetClassInterval(dev_ctx.stream(), - context.GetPlace(), - context.cuda_device_context(), - rid, - rank, - nranks, - D, - &class_interval); - - if (label_type == framework::proto::VarType::INT32) { - typedef int32_t LabelT; - CalculateGrad<<>>( - logit_grad->data(), - loss_grad->data(), - logits->data(), - labels->data(), - margin1, - margin2, - scale, - rank, - N, - D, - class_interval.data()); - } else if (label_type == framework::proto::VarType::INT64) { - typedef int64_t LabelT; - CalculateGrad<<>>( - logit_grad->data(), - loss_grad->data(), - logits->data(), - labels->data(), - margin1, - margin2, - scale, - rank, - N, - D, - class_interval.data()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(margin_cross_entropy, - ops::MarginCrossEntropyOpCUDAKernel, - ops::MarginCrossEntropyOpCUDAKernel, - ops::MarginCrossEntropyOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(margin_cross_entropy_grad, - ops::MarginCrossEntropyGradCUDAKernel, - ops::MarginCrossEntropyGradCUDAKernel, - ops::MarginCrossEntropyGradCUDAKernel); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.h b/paddle/fluid/operators/margin_cross_entropy_op.h deleted file mode 100644 index 9261c84c8552c..0000000000000 --- a/paddle/fluid/operators/margin_cross_entropy_op.h +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/softmax.h" - -namespace paddle { -namespace operators { - -template -class MarginCrossEntropyOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::Unavailable( - "Do not support margin_cross_entropy for cpu kernel " - "now.")); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 42a54195defd7..7cbafe4accdee 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -547,7 +547,8 @@ inline void MatMulWithHeadQK(const phi::GPUContext &context, T beta) { CBLAS_TRANSPOSE transA = !q_trans ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !k_trans ? CblasNoTrans : CblasTrans; - + printf("@@ MatMulWithHeadQK: batch_size:%d, head_num:%d, seq_len:%d\r\n", + batch_size,head_num,seq_len); typedef typename CUDATypeTraits::TYPE run_type; auto blas = phi::funcs::GetBlas(context); auto stream = context.stream(); diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index 8cb7fad48e9de..0de10789ba02e 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 1e8fb983a9499..ad4d3489c21fe 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/sampler.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/ddim.h" namespace paddle { diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 18cd3e7261dd7..7cf7b25233550 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -19,7 +19,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/unpooling.h b/paddle/fluid/operators/math/unpooling.h index 30b6db78f31b6..1b0f52dacd970 100644 --- a/paddle/fluid/operators/math/unpooling.h +++ b/paddle/fluid/operators/math/unpooling.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 36ce3e6474254..680cd6a344579 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -50,18 +50,18 @@ class Vol2ColFunctor { int input_channels = (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); - int input_depth = + int64_t input_depth = (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]); - int input_height = + int64_t input_height = (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]); - int input_width = + int64_t input_width = (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]); int filter_depth = col->dims()[1]; int filter_height = col->dims()[2]; int filter_width = col->dims()[3]; - int output_depth = col->dims()[4]; - int output_height = col->dims()[5]; - int output_width = col->dims()[6]; + int64_t output_depth = col->dims()[4]; + int64_t output_height = col->dims()[5]; + int64_t output_width = col->dims()[6]; int channels_col = input_channels * filter_depth * filter_height * filter_width; @@ -109,22 +109,22 @@ class Vol2ColFunctor { output_width)); const T* vol_data = vol.data(); T* col_data = col->data(); - - for (int c = 0; c < channels_col; ++c) { + for (auto c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; int d_offset = (c / filter_width / filter_height) % filter_depth; - int c_in = c / filter_width / filter_height / filter_depth; - for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; - for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; - for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + int64_t c_in = c / filter_width / filter_height / filter_depth; + for (auto d = 0; d < output_depth; ++d) { + int64_t d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (auto h = 0; h < output_height; ++h) { + int64_t h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (auto w = 0; w < output_width; ++w) { + int64_t w_pad = + w * strides[2] - pad_w_left + w_offset * dilations[2]; - int col_idx = + int64_t col_idx = ((c * output_depth + d) * output_height + h) * output_width + w; - int vol_idx; + int64_t vol_idx; if (data_layout != DataLayout::kNHWC) { vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * input_width + diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index eb0d03ce00a97..6fba33e10ffcf 
100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -196,100 +196,21 @@ struct SoftplusMKLDNNFunctor : public BaseActivationFunctor { } }; -template -using ReluMKLDNNFunctor = - MKLDNNActivationFunc; - template using Relu6MKLDNNFunctor = MKLDNNActivationFunc; -template -using SwishMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using HardSwishMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using MishMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using SigmoidMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using TanhMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using SqrtMKLDNNFunctor = - MKLDNNActivationFunc; - template using AbsMKLDNNFunctor = MKLDNNActivationFunc; -template -using EluMKLDNNFunctor = MKLDNNActivationFunc; - -template -using ExpMKLDNNFunctor = MKLDNNActivationFunc; - -template -using RoundMKLDNNFunctor = - MKLDNNActivationFunc; - -template -using ReluMKLDNNGradFunctor = - MKLDNNActivationGradFunc; - template using Relu6MKLDNNGradFunctor = MKLDNNActivationGradFunc; -template -using SwishMKLDNNGradFunctor = - MKLDNNActivationGradFunc; - -template -using HardSwishMKLDNNGradFunctor = - MKLDNNActivationGradFunc; - -template -using MishMKLDNNGradFunctor = - MKLDNNActivationGradFunc; - -template -using SigmoidMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_logistic_use_dst_for_bwd>; - -template -using TanhMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_tanh_use_dst_for_bwd>; - -template -using SqrtMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_sqrt_use_dst_for_bwd>; - template using AbsMKLDNNGradFunctor = MKLDNNActivationGradFunc; -template -using EluMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_elu_use_dst_for_bwd>; - -template -using ExpMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_exp_use_dst_for_bwd>; - } // namespace operators } // namespace paddle @@ -316,26 +237,13 @@ namespace ops = paddle::operators; ::paddle::platform::CPUPlace, \ ops::MKLDNNActivationKernel>); -#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); \ - __macro(elu, EluMKLDNNFunctor, EluMKLDNNGradUseOutFunctor); \ - __macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor); \ - __macro(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); \ - __macro(hard_swish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor); \ - __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ - __macro(mish, MishMKLDNNFunctor, MishMKLDNNGradFunctor); \ - __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ - __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \ - __macro(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradUseOutFunctor); \ - __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradUseOutFunctor); \ - __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ - __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradUseOutFunctor); +#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ + __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); \ + __macro(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); \ + __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); -// round eltwise primitive doesn't support BF16, nor does it support grad 
-REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor); - namespace ops = paddle::operators; REGISTER_OP_KERNEL( softplus, diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 8ee97c281e3f4..fc8f29913097c 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -24,13 +24,13 @@ namespace paddle { namespace operators { namespace { -inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format, - const int groups, +inline MKLDNNMemoryFormat GetWeightsFormat(const int groups, const bool is_conv3d) { if (is_conv3d) { - return (groups == 1) ? format : MKLDNNMemoryFormat::goidhw; + return (groups == 1) ? MKLDNNMemoryFormat::oidhw + : MKLDNNMemoryFormat::goidhw; } else { - return (groups == 1) ? format : MKLDNNMemoryFormat::goihw; + return (groups == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw; } } @@ -98,10 +98,6 @@ class ConvMKLDNNHandlerT "The input tensor's layout should be %d, but got %d.", framework::DataLayout::kMKLDNN, input->layout())); - PADDLE_ENFORCE_NE(input->format(), - MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for Input tensor")); PADDLE_ENFORCE_EQ( filter->layout(), @@ -110,10 +106,6 @@ class ConvMKLDNNHandlerT "The Filter tensor's layout should be %d, but got %d.", framework::DataLayout::kMKLDNN, filter->layout())); - PADDLE_ENFORCE_NE(filter->format(), - MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for Filter tensor")); PADDLE_ENFORCE_GE( input->dims().size(), @@ -153,10 +145,6 @@ class ConvMKLDNNHandlerT "The Bias tensor's layout should be %d, but got %d.", framework::DataLayout::kMKLDNN, bias->layout())); - PADDLE_ENFORCE_NE(bias->format(), - MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Bias tensor.")); PADDLE_ENFORCE_EQ(bias->dims().size(), 1, @@ -307,10 +295,6 @@ class ConvMKLDNNHandlerT "The input tensor's layout should be %d, but got %d.", framework::DataLayout::kMKLDNN, in->layout())); - PADDLE_ENFORCE_NE(in->format(), - MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Input tensor.")); PADDLE_ENFORCE_EQ( filter->layout(), @@ -319,10 +303,6 @@ class ConvMKLDNNHandlerT "The filter tensor's layout should be %d, but got %d.", framework::DataLayout::kMKLDNN, filter->layout())); - PADDLE_ENFORCE_NE(filter->format(), - MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Filter tensor.")); PADDLE_ENFORCE_EQ( out_grad->layout(), @@ -331,10 +311,6 @@ class ConvMKLDNNHandlerT "The output_grad tensor's layout should be %d, but got %d.", framework::DataLayout::kMKLDNN, out_grad->layout())); - PADDLE_ENFORCE_NE(out_grad->format(), - MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for output_grad tensor")); PADDLE_ENFORCE_EQ( ctx.Attr("is_test"), @@ -596,10 +572,10 @@ class ConvMKLDNNHandlerT auto weights_tz = phi::vectorize(filter->dims()); platform::GetGroupConvWeightsTz(weights_tz, groups); - auto user_src_md = platform::MKLDNNMemDesc( - weights_tz, - platform::MKLDNNGetDataType(), - GetWeightsFormat(filter->format(), groups, is_conv3d)); + auto user_src_md = + platform::MKLDNNMemDesc(weights_tz, + platform::MKLDNNGetDataType(), + GetWeightsFormat(groups, is_conv3d)); return this->AcquireMemoryWithReorder( user_src_md, @@ -660,12 +636,11 @@ class ConvMKLDNNHandlerT auto user_mem_p = 
this->AcquireMemory(user_key_suffix); if (!user_mem_p) { - auto user_mem_md = - platform::MKLDNNMemDesc(phi::vectorize(in_mem->dims()), - platform::MKLDNNGetDataType(), - in_mem->format()); return this->AcquireMemoryWithReorder( - user_mem_md, mem_md, platform::to_void_cast(in_mem_data), key_mem); + in_mem->mem_desc(), + mem_md, + platform::to_void_cast(in_mem_data), + key_mem); } else { const std::string target_key_suffix{key_mem_target}; const auto target_mem_p = this->AcquireMemory(target_key_suffix); @@ -694,10 +669,10 @@ class ConvMKLDNNHandlerT auto weights_tz = phi::vectorize(filter->dims()); platform::GetGroupConvWeightsTz(weights_tz, groups); - auto user_src_md = platform::MKLDNNMemDesc( - weights_tz, - platform::MKLDNNGetDataType(), - GetWeightsFormat(filter->format(), groups, is_conv3d)); + auto user_src_md = + platform::MKLDNNMemDesc(weights_tz, + platform::MKLDNNGetDataType(), + GetWeightsFormat(groups, is_conv3d)); return this->AcquireMemoryWithReorder( user_src_md, @@ -713,10 +688,10 @@ class ConvMKLDNNHandlerT auto weights_tz = phi::vectorize(filter->dims()); platform::GetGroupConvWeightsTz(weights_tz, groups); - auto user_src_md = platform::MKLDNNMemDesc( - weights_tz, - platform::MKLDNNGetDataType(), - GetWeightsFormat(filter->format(), groups, is_conv3d)); + auto user_src_md = + platform::MKLDNNMemDesc(weights_tz, + platform::MKLDNNGetDataType(), + GetWeightsFormat(groups, is_conv3d)); return this->AcquireMemoryWithReorder( user_src_md, @@ -747,13 +722,9 @@ class ConvMKLDNNHandlerT LOG(ERROR) << "Bias should be of type int32 but is " << bias->dtype(); } const K_Bias* bias_data = bias->data(); - auto user_bias_md = - platform::MKLDNNMemDesc(phi::vectorize(bias->dims()), - platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::x); return this->AcquireMemoryWithReorder( - user_bias_md, + bias->mem_desc(), this->fwd_pd_->bias_desc(), platform::to_void_cast(bias_data), "@bias_mem_p", @@ -776,22 +747,16 @@ class ConvMKLDNNHandlerT residual_mem_p->set_data_handle(residual_data); return residual_mem_p; } else { - auto user_residual_md = platform::MKLDNNMemDesc( - phi::vectorize(residual_param->dims()), - framework::ToMKLDNNDataType( - framework::TransToProtoVarType(residual_param->dtype())), - residual_param->format()); - - return this->AcquireMemoryFromPrimitive( - user_residual_md, residual_data, "@user_residual_data_mem_p"); + return this->AcquireMemoryFromPrimitive(residual_param->mem_desc(), + residual_data, + "@user_residual_data_mem_p"); } } std::shared_ptr AcquireDstMemoryWithResidual( framework::Tensor* output, const framework::Tensor* residual_param) { std::shared_ptr dst_memory_p; - if (residual_param->format() != - platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) { + if (residual_param->mem_desc() != this->fwd_pd_->dst_desc()) { auto residual_memory_p = this->AcquireResidualMemory(residual_param); dst_memory_p = this->template AcquireDstMemory(output); this->AcquireReorder(residual_memory_p, dst_memory_p); @@ -903,8 +868,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { conv_p->execute(astream, args); astream.wait(); - output->set_layout(framework::DataLayout::kMKLDNN); - output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); + output->set_mem_desc(dst_memory_p->get_desc()); } template @@ -1018,8 +982,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); } - output->set_layout(framework::DataLayout::kMKLDNN); - output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); + 
output->set_mem_desc(dst_memory_p->get_desc()); } }; @@ -1078,7 +1041,6 @@ class ConvMKLDNNGradOpKernel : public framework::OpKernel { auto conv_bwd_weights_p = handler.AcquireBackwardWeightsPrimitive(); - // TODO(grygielski) why no bias_diff? conv_bwd_weights_p->execute( astream, {{DNNL_ARG_SRC, *src_memory_p}, diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index cd81168753bed..80163389318aa 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -59,11 +59,6 @@ class ConvTransposeMKLDNNHandlerT DataLayout::kMKLDNN, platform::errors::InvalidArgument( "Got wrong layout = %d for Input tensor.", input->layout())); - PADDLE_ENFORCE_NE(input->format(), - MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Input tensor. The input " - "format is undefined.")); PADDLE_ENFORCE_EQ( filter->layout(), @@ -72,10 +67,6 @@ class ConvTransposeMKLDNNHandlerT "The filter tensor's layout should be %d, but got %d.", DataLayout::kMKLDNN, filter->layout())); - PADDLE_ENFORCE_NE(filter->format(), - MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong formats for Filter tensor.")); PADDLE_ENFORCE_EQ( input->dims().size(), @@ -98,10 +89,6 @@ class ConvTransposeMKLDNNHandlerT "The bias tensor's laytout should be %d, but got %d.", DataLayout::kMKLDNN, bias->layout())); - PADDLE_ENFORCE_NE(bias->format(), - MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Bias tensor.")); PADDLE_ENFORCE_EQ( bias->dims().size(), @@ -233,11 +220,8 @@ class ConvTransposeMKLDNNHandlerT std::shared_ptr AcquireSrcMemoryWithReorder( const framework::Tensor* input) { const T* input_data = input->data(); - auto user_src_md = platform::MKLDNNMemDesc(phi::vectorize(input->dims()), - platform::MKLDNNGetDataType(), - input->format()); return platform::MKLDNNHandlerNoCachingT:: - AcquireMemoryWithReorder(user_src_md, + AcquireMemoryWithReorder(input->mem_desc(), this->fwd_pd_->src_desc(), platform::to_void_cast(input_data)); } @@ -427,8 +411,7 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel { auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); conv_p->execute(astream, args); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); + output->set_mem_desc(dst_memory_p->get_desc()); } }; diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 5c3dd0cb1234a..2f9fa210e225a 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -446,7 +446,6 @@ class MatMulMKLDNNHandler if (scale_out != 1.0f) { matmul_attrs.set_output_scales(0, {scale_out}); } - paddle::platform::AppendActivation(ctx, post_operations); matmul_attrs.set_post_ops(post_operations); @@ -660,7 +659,7 @@ float ComputeOutputScale(const ExecutionContext &ctx) { return alpha * scale_out / (scale_x * scale_y); } -template +template void ExecuteMatMulV2(const ExecutionContext &ctx, const MKLDNNDeviceContext &dev_ctx, const dnnl::engine onednn_engine, @@ -676,16 +675,16 @@ void ExecuteMatMulV2(const ExecutionContext &ctx, int execution_number = 0) { std::vector x_strides_override = GetInputStrides(ctx, "X"); std::vector y_strides_override = GetInputStrides(ctx, "Y"); - MatMulV2MKLDNNHandler 
handler(ctx, - onednn_engine, - ctx.GetPlace(), - x_dims, - trans_x, - y_dims, - trans_y, - IsOutputFused(ctx), - x_strides_override, - y_strides_override); + MatMulV2MKLDNNHandler handler(ctx, + onednn_engine, + ctx.GetPlace(), + x_dims, + trans_x, + y_dims, + trans_y, + IsOutputFused(ctx), + x_strides_override, + y_strides_override); const auto src_memory_p = handler.AcquireSrcMemory(x); const auto weights_memory_p = handler.AcquireWeightsMemory(y); @@ -698,20 +697,51 @@ void ExecuteMatMulV2(const ExecutionContext &ctx, {DNNL_ARG_WEIGHTS, *weights_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; + if (ctx.HasInput("ResidualData")) { + auto *residual_data = ctx.Input("ResidualData"); + const auto residual_data_memory_p = handler.AcquireSrcMemory(residual_data); + matmul_args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *residual_data_memory_p}); + } + auto &astream = MKLDNNDeviceContext::tls().get_stream(); matmul_p->execute(astream, matmul_args); astream.wait(); - - auto format = paddle::platform::MKLDNNFormatForSize( - out->dims().size(), dnnl::memory::format_tag::nchw); - out->set_layout(paddle::framework::DataLayout::kMKLDNN); + auto format = + MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw); out->set_format(format); + out->set_layout(DataLayout::kMKLDNN); } template class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { public: - void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } + void Compute(const ExecutionContext &ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), + 1, + paddle::platform::errors::Unimplemented( + "oneDNN matmul doesn't support multiple heads. Expected " + "head_number=1. But received `head_number` is %d", + ctx.Attr("head_number"))); + } + constexpr bool is_int8 = IsInt8(); + constexpr bool is_bfloat16 = IsBfloat16(); + const bool force_fp32_output = ctx.HasAttr("force_fp32_output") + ? 
ctx.Attr("force_fp32_output") + : false; + constexpr bool fuse_relu = false; // TODO(intel): Enable eltwise fuses + if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { + RunKernel(ctx); + } else if (is_bfloat16) { + RunKernel(ctx); + } else if (fuse_relu) { + RunKernel(ctx); + } else { + RunKernel(ctx); + } + } private: void CalculateMatrixDims(const ExecutionContext &ctx, @@ -762,6 +792,7 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { } } + template void RunKernel(const ExecutionContext &ctx) const { const auto &dev_ctx = ctx.template device_context(); const auto &onednn_engine = dev_ctx.GetEngine(); @@ -787,18 +818,18 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { CalculateMatrixDims( ctx, x_dims, y_dims, &x_bd_dims, &y_bd_dims, &out_dims, out); - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - x, - x_bd_dims, - trans_x, - y, - y_bd_dims, - trans_y, - out, - out_dims); + ExecuteMatMulV2(ctx, + dev_ctx, + onednn_engine, + ctx.GetPlace(), + x, + x_bd_dims, + trans_x, + y, + y_bd_dims, + trans_y, + out, + out_dims); } }; @@ -933,113 +964,113 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { ctx, &dx_tmp, &dy_tmp, x_dims, y_dims, &dx_bd_dims, &dy_bd_dims); if (trans_x && trans_y) { - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - y, - y_dims, - true, - dout, - dout_dims, - true, - &dx_tmp, - dx_bd_dims, - 1); - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - dout, - dout_dims, - true, - x, - x_dims, - true, - &dy_tmp, - dy_bd_dims, - 2); + ExecuteMatMulV2(ctx, + dev_ctx, + onednn_engine, + ctx.GetPlace(), + y, + y_dims, + true, + dout, + dout_dims, + true, + &dx_tmp, + dx_bd_dims, + 1); + ExecuteMatMulV2(ctx, + dev_ctx, + onednn_engine, + ctx.GetPlace(), + dout, + dout_dims, + true, + x, + x_dims, + true, + &dy_tmp, + dy_bd_dims, + 2); } else if (trans_x) { - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - y, - y_dims, - false, - dout, - dout_dims, - true, - &dx_tmp, - dx_bd_dims, - 1); - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - x, - x_dims, - false, - dout, - dout_dims, - false, - &dy_tmp, - dy_bd_dims, - 2); + ExecuteMatMulV2(ctx, + dev_ctx, + onednn_engine, + ctx.GetPlace(), + y, + y_dims, + false, + dout, + dout_dims, + true, + &dx_tmp, + dx_bd_dims, + 1); + ExecuteMatMulV2(ctx, + dev_ctx, + onednn_engine, + ctx.GetPlace(), + x, + x_dims, + false, + dout, + dout_dims, + false, + &dy_tmp, + dy_bd_dims, + 2); } else if (trans_y) { - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - dout, - dout_dims, - false, - y, - y_dims, - false, - &dx_tmp, - dx_bd_dims, - 1); - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - dout, - dout_dims, - true, - x, - x_dims, - false, - &dy_tmp, - dy_bd_dims, - 2); + ExecuteMatMulV2(ctx, + dev_ctx, + onednn_engine, + ctx.GetPlace(), + dout, + dout_dims, + false, + y, + y_dims, + false, + &dx_tmp, + dx_bd_dims, + 1); + ExecuteMatMulV2(ctx, + dev_ctx, + onednn_engine, + ctx.GetPlace(), + dout, + dout_dims, + true, + x, + x_dims, + false, + &dy_tmp, + dy_bd_dims, + 2); } else { - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - dout, - dout_dims, - false, - y, - y_dims, - true, - &dx_tmp, - dx_bd_dims, - 1); - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - x, - x_dims, - true, - dout, - dout_dims, - false, - &dy_tmp, - dy_bd_dims, - 2); + ExecuteMatMulV2(ctx, + dev_ctx, + onednn_engine, + 
ctx.GetPlace(), + dout, + dout_dims, + false, + y, + y_dims, + true, + &dx_tmp, + dx_bd_dims, + 1); + ExecuteMatMulV2(ctx, + dev_ctx, + onednn_engine, + ctx.GetPlace(), + x, + x_dims, + true, + dout, + dout_dims, + false, + &dy_tmp, + dy_bd_dims, + 2); } if (x_dims != dx_bd_dims) { @@ -1228,34 +1259,13 @@ template class MatMulGradMKLDNNKernel; namespace ops = paddle::operators; -REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(matmul, - MKLDNN, - ::paddle::platform::CPUPlace, - S8, - 0, - MatMulMKLDNNKernel); - -REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(matmul, - MKLDNN, - ::paddle::platform::CPUPlace, - U8, - 0, - MatMulMKLDNNKernel); - -REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(matmul, - MKLDNN, - ::paddle::platform::CPUPlace, - FP32, - 0, - MatMulV2MKLDNNKernel); - -REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( - matmul, - MKLDNN, - ::paddle::platform::CPUPlace, - BF16, - 0, - MatMulV2MKLDNNKernel); +REGISTER_OP_KERNEL(matmul, + MKLDNN, + ::paddle::platform::CPUPlace, + MatMulV2MKLDNNKernel, + MatMulV2MKLDNNKernel, + MatMulV2MKLDNNKernel, + MatMulV2MKLDNNKernel); REGISTER_OP_KERNEL(matmul_grad, MKLDNN, @@ -1267,7 +1277,9 @@ REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace, MatMulV2MKLDNNKernel, - MatMulV2MKLDNNKernel); + MatMulV2MKLDNNKernel, + MatMulV2MKLDNNKernel, + MatMulV2MKLDNNKernel); REGISTER_OP_KERNEL(matmul_v2_grad, MKLDNN, diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 956dbc810fa48..e727a4fe9fb48 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -416,16 +416,16 @@ class MulMKLDNNKernel : public framework::OpKernel { bool trans_y, Tensor *out) const { static const std::vector vec_placeholder; - MatMulV2MKLDNNHandler handler(ctx, - onednn_engine, - ctx.GetPlace(), - x_dims, - trans_x, - y_dims, - trans_y, - false, - vec_placeholder, - vec_placeholder); + MatMulV2MKLDNNHandler handler(ctx, + onednn_engine, + ctx.GetPlace(), + x_dims, + trans_x, + y_dims, + trans_y, + false, + vec_placeholder, + vec_placeholder); const auto src_memory_p = handler.AcquireSrcMemory(x); const auto weights_memory_p = handler.AcquireWeightsMemory(y); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 076460c4b7642..f05bd2635116c 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -25,13 +25,14 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); USE_OP_ITSELF(relu); -USE_OP_DEVICE_KERNEL(relu, MKLDNN); +PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(conv2d); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 2a9f07494c5d4..196e018507069 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -30,7 +30,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(relu); -USE_OP_DEVICE_KERNEL(relu, MKLDNN); +PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(softmax); 
USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 6303a13745384..db590807179d9 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -30,7 +30,7 @@ USE_OP_ITSELF(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); USE_OP_ITSELF(relu); -USE_OP_DEVICE_KERNEL(relu, MKLDNN); +PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); USE_OP_ITSELF(shape); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index e7d795ccc579c..d922b2a30cf90 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -15,15 +15,18 @@ #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/optimizers/cast_with_ptr.h" -#include "paddle/fluid/operators/tensor_to_string.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/kernels/funcs/algorithm.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/tensor_to_string.h" namespace paddle { namespace operators { +using phi::funcs::FlattenToString; +using phi::funcs::ToVector; + struct ParamGradInfo { framework::Tensor *param_t{nullptr}; framework::Tensor *grad_t{nullptr}; diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index f1b852301d4d9..5e6c43aa12712 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -19,12 +19,12 @@ #include "paddle/fluid/operators/optimizers/cast_with_ptr.h" #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h" #include "paddle/fluid/operators/optimizers/multi_tensor_apply.h" -#include "paddle/fluid/operators/tensor_to_string.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/tensor_to_string.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -43,6 +43,8 @@ namespace operators { template using MasterT = typename details::MPTypeTrait::Type; +using phi::funcs::FlattenToString; +using phi::funcs::ToVector; template static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { @@ -1697,37 +1699,39 @@ class DistributedFusedLambOpKernel // (1) ReduceScater first if (local_shard) { if (use_hierarchical_allreduce) { - NCCLAllReduceWithScale(fp32_grad, - fp32_sum_grad, - fp32_numel, - nranks / num_devices, - external_comm, - stream, - dev_ctx); NCCLReduceScatterWithScale( - fp32_sum_grad, + fp32_grad, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, num_devices, local_comm, stream, dev_ctx); + NCCLAllReduceWithScale( + fp32_sum_grad + local_rank * fp32_numel_each_device, + fp32_sum_grad + local_rank * fp32_numel_each_device, + fp32_numel_each_device, + nranks / num_devices, + external_comm, + stream, + dev_ctx); - NCCLAllReduceWithScale(fp16_grad, - fp16_sum_grad, - 
fp16_numel, - nranks / num_devices, - external_comm, - stream, - dev_ctx); NCCLReduceScatterWithScale( - fp16_sum_grad, + fp16_grad, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, num_devices, local_comm, stream, dev_ctx); + NCCLAllReduceWithScale( + fp16_sum_grad + local_rank * fp16_numel_each_device, + fp16_sum_grad + local_rank * fp16_numel_each_device, + fp16_numel_each_device, + nranks / num_devices, + external_comm, + stream, + dev_ctx); } else { NCCLAllReduceWithScale(fp32_grad, fp32_sum_grad, @@ -1839,38 +1843,40 @@ class DistributedFusedLambOpKernel // (3) Do ReduceScatter with scale if (local_shard) { if (use_hierarchical_allreduce) { - NCCLAllReduceWithScale(fp32_grad, - fp32_sum_grad, - fp32_numel, - nranks / num_devices, - external_comm, - stream, - dev_ctx, - fp32_scale); NCCLReduceScatterWithScale( - fp32_sum_grad, + fp32_grad, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, num_devices, local_comm, stream, + dev_ctx, + fp32_scale); + NCCLAllReduceWithScale( + fp32_sum_grad + local_rank * fp32_numel_each_device, + fp32_sum_grad + local_rank * fp32_numel_each_device, + fp32_numel_each_device, + nranks / num_devices, + external_comm, + stream, dev_ctx); - NCCLAllReduceWithScale(fp16_grad, - fp16_sum_grad, - fp16_numel, - nranks / num_devices, - external_comm, - stream, - dev_ctx, - fp16_scale); NCCLReduceScatterWithScale( - fp16_sum_grad, + fp16_grad, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, num_devices, local_comm, stream, + dev_ctx, + fp16_scale); + NCCLAllReduceWithScale( + fp16_sum_grad + local_rank * fp16_numel_each_device, + fp16_sum_grad + local_rank * fp16_numel_each_device, + fp16_numel_each_device, + nranks / num_devices, + external_comm, + stream, dev_ctx); } else { NCCLAllReduceWithScale(fp32_grad, @@ -1917,37 +1923,39 @@ class DistributedFusedLambOpKernel } else { if (local_shard) { if (use_hierarchical_allreduce) { - NCCLAllReduceWithScale(fp32_grad, - fp32_sum_grad, - fp32_numel, - nranks / num_devices, - external_comm, - stream, - dev_ctx); NCCLReduceScatterWithScale( - fp32_sum_grad, + fp32_grad, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, num_devices, local_comm, stream, dev_ctx); + NCCLAllReduceWithScale( + fp32_sum_grad + local_rank * fp32_numel_each_device, + fp32_sum_grad + local_rank * fp32_numel_each_device, + fp32_numel_each_device, + nranks / num_devices, + external_comm, + stream, + dev_ctx); - NCCLAllReduceWithScale(fp16_grad, - fp16_sum_grad, - fp16_numel, - nranks / num_devices, - external_comm, - stream, - dev_ctx); NCCLReduceScatterWithScale( - fp16_sum_grad, + fp16_grad, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, num_devices, local_comm, stream, dev_ctx); + NCCLAllReduceWithScale( + fp16_sum_grad + local_rank * fp16_numel_each_device, + fp16_sum_grad + local_rank * fp16_numel_each_device, + fp16_numel_each_device, + nranks / num_devices, + external_comm, + stream, + dev_ctx); } else { NCCLAllReduceWithScale(fp32_grad, fp32_sum_grad, diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc index 8434da2bb0e76..cc3c99f9b1129 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cc +++ b/paddle/fluid/operators/optimizers/lamb_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/lamb_op.h" - #include - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/kernels/lamb_kernel.h" namespace paddle { namespace operators { @@ -25,125 +29,6 @@ class LambOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), - true, - platform::errors::NotFound( - "Input(Param) of LambOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), - true, - platform::errors::NotFound( - "Input(Grad) of LambOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), - true, - platform::errors::NotFound( - "Input(Moment1) of LambOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), - true, - platform::errors::NotFound( - "Input(Moment2) of LambOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), - true, - platform::errors::NotFound( - "Input(LearningRate) of LambOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), - true, - platform::errors::NotFound( - "Input(Beta1Pow) of LambOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), - true, - platform::errors::NotFound( - "Input(Beta2Pow) of LambOp should not be null.")); - - PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), - true, - platform::errors::NotFound( - "Output(ParamOut) of LambOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), - true, - platform::errors::NotFound( - "Output(Moment1Out) of LambOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), - true, - platform::errors::NotFound( - "Output(Moment2Out) of LambOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Beta1PowOut"), - true, - platform::errors::NotFound( - "Output(Beta1PowOut) of LambOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Beta2PowOut"), - true, - platform::errors::NotFound( - "Output(Beta2PowOut) of LambOp should not be null.")); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE( - phi::product(lr_dims), - 0, - platform::errors::InvalidArgument( - "The number of LearningRate shall not be 0, but received %d. Maybe " - "the Input variable LearningRate has not " - "been initialized. 
You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - phi::product(lr_dims))); - PADDLE_ENFORCE_EQ( - phi::product(lr_dims), - 1, - platform::errors::InvalidArgument( - "Learning rate should have 1 dimension, but received %d.", - phi::product(lr_dims))); - auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_GE(phi::product(beta1_pow_dims), - 1, - platform::errors::InvalidArgument( - "The size of Beta1 power accumulator should be " - "greater than 0, but received %d.", - phi::product(beta1_pow_dims))); - auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); - PADDLE_ENFORCE_GE(phi::product(beta2_pow_dims), - 1, - platform::errors::InvalidArgument( - "The size of Beta2 power accumulator should be " - "greater than 0, but received %d.", - phi::product(beta2_pow_dims))); - - auto param_dims = ctx->GetInputDim("Param"); - if (ctx->GetInputsVarType("Grad")[0] == - framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ( - param_dims, - ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and Grad input of LambOp should have same dimension. But " - "received Param dims: [%s], Grad dims: [%s].", - param_dims, - ctx->GetInputDim("Grad"))); - } - PADDLE_ENFORCE_EQ( - param_dims, - ctx->GetInputDim("Moment1"), - platform::errors::InvalidArgument( - "Param and Moment1 input of LambOp should have same dimension. But " - "received Param dims: [%s], Moment1 dims: [%s].", - param_dims, - ctx->GetInputDim("Moment1"))); - PADDLE_ENFORCE_EQ( - param_dims, - ctx->GetInputDim("Moment2"), - platform::errors::InvalidArgument( - "Param and Moment2 input of LambOp should have same dimension. But " - "received Param dims: [%s], Moment2 dims: [%s].", - param_dims, - ctx->GetInputDim("Moment2"))); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("Moment1Out", param_dims); - ctx->SetOutputDim("Moment2Out", param_dims); - ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims); - ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto input_data_type = @@ -246,10 +131,16 @@ learning rate, $\lambda$ the weight decay rate. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(lamb, ops::LambOp, ops::LambOpMaker); -REGISTER_OP_CPU_KERNEL(lamb, - ops::LambOpKernel, - ops::LambOpKernel); +DECLARE_INFER_SHAPE_FUNCTOR(lamb, + LambInferMetaFunctor, + PD_INFER_META(phi::LambInferMeta)); +REGISTER_OPERATOR( + lamb, + ops::LambOp, + ops::LambOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + LambInferMetaFunctor); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(lamb).AddCheckpoint( diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h deleted file mode 100644 index 0415bb7df02ac..0000000000000 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ /dev/null @@ -1,813 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include // for sqrt in CPU and CUDA - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/buffer.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/operators/tensor_to_string.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/algorithm.h" -#include "paddle/phi/kernels/funcs/eigen/extensions.h" -#include "paddle/phi/kernels/funcs/squared_l2_norm.h" - -namespace paddle { -namespace operators { - -namespace scatter = paddle::operators::math::scatter; - -template -struct LambMomentREGUpdateFunctor { - using MT = typename std::conditional::Type, - T>::type; - - MT weight_decay_; - MT beta1_; - MT beta2_; - MT epsilon_; - - MT beta1_pow_; - MT* beta1_pow_out_; - MT beta2_pow_; - MT* beta2_pow_out_; - const MT* moment1_; - MT* moment1_out_; - const MT* moment2_; - MT* moment2_out_; - const T* grad_; - const MT* param_; - MT* trust_ratio_div_; - const bool* skip_update_; - - LambMomentREGUpdateFunctor(MT weight_decay, - MT beta1, - MT beta2, - MT epsilon, - MT beta1_pow, - MT beta2_pow, - const MT* mom1, - MT* mom1_out, - const MT* mom2, - MT* mom2_out, - const T* grad, - const MT* param, - MT* trust_ratio_div, - const bool* skip_update) - : weight_decay_(weight_decay), - beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - grad_(grad), - param_(param), - trust_ratio_div_(trust_ratio_div), - skip_update_(skip_update) {} - - inline HOSTDEVICE void operator()(size_t i) const { - if (skip_update_ && *skip_update_) return; - - MT g = static_cast(grad_[i]); - MT mom1 = moment1_[i]; - MT mom2 = moment2_[i]; - MT beta1_pow = beta1_pow_; - MT beta2_pow = beta2_pow_; - MT p = param_[i]; - - mom1 = beta1_ * mom1 + (static_cast(1) - beta1_) * g; - mom2 = beta2_ * mom2 + (static_cast(1) - beta2_) * g * g; - - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - - MT mom1_unbiased = mom1 / (static_cast(1) - beta1_pow); - MT mom2_unbiased = mom2 / (static_cast(1) - beta2_pow); - trust_ratio_div_[i] = - mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + - weight_decay_ * p; - } -}; - -template -struct LambMomentMENUpdateFunctor { - using MT = typename std::conditional::Type, - T>::type; - - MT weight_decay_; - MT beta1_; - MT beta2_; - MT epsilon_; - - const MT* beta1_pow_; - const MT* beta2_pow_; - const MT* moment1_; - MT* moment1_out_; - const MT* moment2_; - MT* moment2_out_; - const T* grad_; - const MT* param_; - MT* trust_ratio_div_; - const bool* skip_update_; - - LambMomentMENUpdateFunctor(MT weight_decay, - MT beta1, - MT beta2, - MT epsilon, - const MT* beta1_pow, - const MT* beta2_pow, - const MT* mom1, - MT* mom1_out, - const MT* mom2, - MT* mom2_out, - const T* grad, - const MT* param, - MT* trust_ratio_div, - const bool* skip_update) - : weight_decay_(weight_decay), - beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - 
beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - grad_(grad), - param_(param), - trust_ratio_div_(trust_ratio_div), - skip_update_(skip_update) {} - - inline HOSTDEVICE void operator()(size_t i) const { - if (skip_update_ && *skip_update_) return; - MT g = static_cast(grad_[i]); - MT mom1 = moment1_[i]; - MT mom2 = moment2_[i]; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - MT p = param_[i]; - - mom1 = beta1_ * mom1 + (static_cast(1) - beta1_) * g; - mom2 = beta2_ * mom2 + (static_cast(1) - beta2_) * g * g; - - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - - MT mom1_unbiased = mom1 / (static_cast(1) - beta1_pow); - MT mom2_unbiased = mom2 / (static_cast(1) - beta2_pow); - trust_ratio_div_[i] = - mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + - weight_decay_ * p; - } -}; - -template -struct SparseLambMomentREGUpdateFunctor { - T weight_decay_; - T beta1_; - T beta2_; - T epsilon_; - - T beta1_pow_; - T beta2_pow_; - const T* moment1_; - T* moment1_out_; - const T* moment2_; - T* moment2_out_; - const T* grad_; - const T* param_; - T* trust_ratio_div_; - - const int64_t* rows_; - int64_t row_numel_; - int64_t row_count_; - - const bool* skip_update_; - - SparseLambMomentREGUpdateFunctor(T weight_decay, - T beta1, - T beta2, - T epsilon, - T beta1_pow, - T beta2_pow, - const T* mom1, - T* mom1_out, - const T* mom2, - T* mom2_out, - const T* grad, - const T* param, - T* trust_ratio_div, - const int64_t* rows, - int64_t row_numel, - int64_t row_count, - const bool* skip_update) - : weight_decay_(weight_decay), - beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - grad_(grad), - param_(param), - trust_ratio_div_(trust_ratio_div), - rows_(rows), - row_numel_(row_numel), - row_count_(row_count), - skip_update_(skip_update) {} - - inline HOSTDEVICE void update(size_t i, T g) const { - // The following code is same as dense - T mom1 = moment1_[i]; - T mom2 = moment2_[i]; - T beta1_pow = beta1_pow_; - T beta2_pow = beta2_pow_; - T p = param_[i]; - - mom1 = beta1_ * mom1 + (static_cast(1) - beta1_) * g; - mom2 = beta2_ * mom2 + (static_cast(1) - beta2_) * g * g; - - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - - T mom1_unbiased = mom1 / (static_cast(1) - beta1_pow); - T mom2_unbiased = mom2 / (static_cast(1) - beta2_pow); - trust_ratio_div_[i] = - mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + - weight_decay_ * p; - } - - inline HOSTDEVICE void operator()(size_t i) const { - if (skip_update_ && *skip_update_) return; - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); - T g = row_idx >= 0 ? 
grad_[row_idx * row_numel_ + i % row_numel_] - : static_cast(0); - update(i, g); - } -}; - -template -struct SparseLambMomentMENUpdateFunctor { - T weight_decay_; - T beta1_; - T beta2_; - T epsilon_; - - const T* beta1_pow_; - const T* beta2_pow_; - const T* moment1_; - T* moment1_out_; - const T* moment2_; - T* moment2_out_; - const T* grad_; - const T* param_; - T* trust_ratio_div_; - - const int64_t* rows_; - int64_t row_numel_; - int64_t row_count_; - - const bool* skip_update_; - - SparseLambMomentMENUpdateFunctor(T weight_decay, - T beta1, - T beta2, - T epsilon, - const T* beta1_pow, - const T* beta2_pow, - const T* mom1, - T* mom1_out, - const T* mom2, - T* mom2_out, - const T* grad, - const T* param, - T* trust_ratio_div, - const int64_t* rows, - int64_t row_numel, - int64_t row_count, - const bool* skip_update) - : weight_decay_(weight_decay), - beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - grad_(grad), - param_(param), - trust_ratio_div_(trust_ratio_div), - rows_(rows), - row_numel_(row_numel), - row_count_(row_count), - skip_update_(skip_update) {} - - inline HOSTDEVICE void update(size_t i, T g) const { - // The following code is same as dense - T mom1 = moment1_[i]; - T mom2 = moment2_[i]; - T beta1_pow = *beta1_pow_; - T beta2_pow = *beta2_pow_; - T p = param_[i]; - - mom1 = beta1_ * mom1 + (static_cast(1) - beta1_) * g; - mom2 = beta2_ * mom2 + (static_cast(1) - beta2_) * g * g; - - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - - T mom1_unbiased = mom1 / (static_cast(1) - beta1_pow); - T mom2_unbiased = mom2 / (static_cast(1) - beta2_pow); - trust_ratio_div_[i] = - mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + - weight_decay_ * p; - } - - inline HOSTDEVICE void operator()(size_t i) const { - if (skip_update_ && *skip_update_) return; - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); - T g = row_idx >= 0 ? 
grad_[row_idx * row_numel_ + i % row_numel_] - : static_cast(0); - update(i, g); - } -}; - -template -struct LambBetaPowUpdateFunctor { - void SetBetaPows(const MT* beta1pow, - const MT* beta2pow, - MT* beta1pow_out, - MT* beta2pow_out, - MT beta1, - MT beta2) { - beta1pow_ = beta1pow; - beta2pow_ = beta2pow; - beta1pow_out_ = beta1pow_out; - beta2pow_out_ = beta2pow_out; - beta1_ = beta1; - beta2_ = beta2; - } - - HOSTDEVICE void UpdateBetaPow(size_t i) const { - if (i == 0) { - beta1pow_out_[0] = beta1pow_[0] * beta1_; - beta2pow_out_[0] = beta2pow_[0] * beta2_; - } - } - - private: - const MT* beta1pow_; - const MT* beta2pow_; - MT* beta1pow_out_; - MT* beta2pow_out_; - MT beta1_; - MT beta2_; -}; - -template -struct LambBetaPowUpdateFunctor { - void SetBetaPows(const MT* beta1pow, - const MT* beta2pow, - MT* beta1pow_out, - MT* beta2pow_out, - MT beta1, - MT beta2) {} - HOSTDEVICE void UpdateBetaPow(size_t) const {} -}; - -template -struct LambParamUpateFunctor - : public LambBetaPowUpdateFunctor { - const MT* lr_; - const T* param_; - const MT* master_param_; - const MT* param_norm_; - const MT* trust_ratio_div_; - const MT* trust_ratio_div_norm_; - T* param_out_; - MT* master_param_out_; - - const bool* skip_update_; - - LambParamUpateFunctor(const MT* lr, - const T* param, - const MT* master_param, - const MT* param_norm, - const MT* trust_ratio_div, - const MT* trust_ratio_div_norm, - T* param_out, - MT* master_param_out, - const bool* skip_update) - : lr_(lr), - param_(param), - master_param_(master_param), - param_norm_(param_norm), - trust_ratio_div_(trust_ratio_div), - trust_ratio_div_norm_(trust_ratio_div_norm), - param_out_(param_out), - master_param_out_(master_param_out), - skip_update_(skip_update) {} - - inline HOSTDEVICE void operator()(size_t i) const { - if (skip_update_ && *skip_update_) return; - MT lr = *lr_; - MT pn = Eigen::numext::sqrt(*param_norm_); - MT tn = Eigen::numext::sqrt(*trust_ratio_div_norm_); - - MT r = (pn > static_cast(0) && tn > static_cast(0)) - ? pn / tn - : static_cast(1); - lr *= r; - MT p = IsMultiPrecision ? master_param_[i] : static_cast(param_[i]); - MT param_out = p - lr * trust_ratio_div_[i]; - param_out_[i] = static_cast(param_out); - if (IsMultiPrecision) { - master_param_out_[i] = param_out; - } - this->UpdateBetaPow(i); - } -}; - -template -class LambOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using MT = typename details::MPTypeTrait::Type; - bool multi_precision = ctx.Attr("multi_precision"); - if (multi_precision) { - ComputeImpl(ctx); - } else { - ComputeImpl(ctx); - } - } - - private: - template - void ComputeImpl(const framework::ExecutionContext& ctx) const { - if (!IsMultiPrecision) { - constexpr auto kIsSameType = std::is_same::value; - PADDLE_ENFORCE_EQ( - kIsSameType, - true, - platform::errors::InvalidArgument( - "When multi_precision=False, T and MT must be the same type.")); - } - const auto* skip_update = ctx.Input("SkipUpdate"); - const bool* skip_update_flag = skip_update && skip_update->IsInitialized() - ? 
skip_update->data() - : nullptr; - if (skip_update_flag && platform::is_cpu_place(skip_update->place()) && - (*skip_update_flag)) { - return; - } - - auto weight_decay = static_cast(ctx.Attr("weight_decay")); - auto beta1 = static_cast(ctx.Attr("beta1")); - auto beta2 = static_cast(ctx.Attr("beta2")); - auto epsilon = static_cast(ctx.Attr("epsilon")); - const auto& param = GET_DATA_SAFELY( - ctx.Input("Param"), "Input", "Param", "Lamb"); - const auto* grad_var = ctx.InputVar("Grad"); - const auto& mom1 = GET_DATA_SAFELY( - ctx.Input("Moment1"), "Input", "Moment1", "Lamb"); - const auto& mom2 = GET_DATA_SAFELY( - ctx.Input("Moment2"), "Input", "Moment2", "Lamb"); - const auto& lr = - GET_DATA_SAFELY(ctx.Input("LearningRate"), - "Input", - "LearningRate", - "Lamb"); - - const auto& beta1_pow = - GET_DATA_SAFELY(ctx.Input("Beta1Pow"), - "Input", - "Beta1Pow", - "Lamb"); - const auto& beta2_pow = - GET_DATA_SAFELY(ctx.Input("Beta2Pow"), - "Input", - "Beta2Pow", - "Lamb"); - - auto& param_out = - GET_DATA_SAFELY(ctx.Output("ParamOut"), - "Output", - "ParamOut", - "Lamb"); - auto& mom1_out = - GET_DATA_SAFELY(ctx.Output("Moment1Out"), - "Output", - "Moment1Out", - "Lamb"); - auto& mom2_out = - GET_DATA_SAFELY(ctx.Output("Moment2Out"), - "Output", - "Moment2Out", - "Lamb"); - auto& beta1_pow_out = - GET_DATA_SAFELY(ctx.Output("Beta1PowOut"), - "Output", - "Beta1PowOut", - "Lamb"); - auto& beta2_pow_out = - GET_DATA_SAFELY(ctx.Output("Beta2PowOut"), - "Output", - "Beta2PowOut", - "Lamb"); - const auto* master_param = - IsMultiPrecision ? ctx.Input("MasterParam") - : nullptr; - auto* master_param_out = - IsMultiPrecision ? ctx.Output("MasterParamOut") - : nullptr; - - if (IsMultiPrecision) { - PADDLE_ENFORCE_NOT_NULL(master_param, - platform::errors::InvalidArgument( - "Input(MasterParam) must be provided when " - "multi_precision=True.")); - PADDLE_ENFORCE_NOT_NULL(master_param_out, - platform::errors::InvalidArgument( - "Output(MasterParamOut) must be provided " - "when multi_precision=True.")); - } - - auto& dev_ctx = ctx.template device_context(); - auto numel = param.numel(); - platform::ForRange for_range(dev_ctx, numel); - auto trust_ratio_div = - ctx.AllocateTmpTensor(param.dims(), dev_ctx); - auto* trust_ratio_div_ptr = trust_ratio_div.template data(); - - const void* param_ptr = param.data(); - const void* master_param_ptr = - master_param ? master_param->data() : nullptr; - void* param_out_ptr = param_out.template mutable_data(ctx.GetPlace()); - void* master_param_out_ptr = - master_param_out - ? master_param_out->template mutable_data(ctx.GetPlace()) - : nullptr; - - // Update moments - bool should_update_beta_pow_later = false; - const MT *beta1_pow_ptr = nullptr, *beta2_pow_ptr = nullptr; - MT *beta1_pow_out_ptr = nullptr, *beta2_pow_out_ptr = nullptr; - VLOG(10) << "Beta1Pow place: " << beta1_pow.place() - << " , Beta2Pow place: " << beta2_pow.place(); - if (grad_var->IsType()) { - auto& grad = grad_var->Get(); - if (platform::is_gpu_place(ctx.GetPlace()) && - beta1_pow.place() == platform::CPUPlace() && - beta2_pow.place() == platform::CPUPlace()) { - LambMomentREGUpdateFunctor moment_update_functor( - weight_decay, - beta1, - beta2, - epsilon, - *beta1_pow.template data(), - *beta2_pow.template data(), - mom1.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2.template data(), - mom2_out.template mutable_data(ctx.GetPlace()), - grad.template data(), - static_cast(IsMultiPrecision ? 
master_param_ptr - : param_ptr), - trust_ratio_div_ptr, - skip_update_flag); - for_range(moment_update_functor); - beta1_pow_out.template mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow.template data()[0]; - beta2_pow_out.template mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow.template data()[0]; - } else { - beta1_pow_ptr = beta1_pow.template data(); - beta2_pow_ptr = beta2_pow.template data(); - beta1_pow_out_ptr = - beta1_pow_out.template mutable_data(ctx.GetPlace()); - beta2_pow_out_ptr = - beta2_pow_out.template mutable_data(ctx.GetPlace()); - should_update_beta_pow_later = true; - LambMomentMENUpdateFunctor moment_update_functor( - weight_decay, - beta1, - beta2, - epsilon, - static_cast(beta1_pow_ptr), - static_cast(beta2_pow_ptr), - mom1.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2.template data(), - mom2_out.template mutable_data(ctx.GetPlace()), - grad.template data(), - static_cast(IsMultiPrecision ? master_param_ptr - : param_ptr), - trust_ratio_div_ptr, - skip_update_flag); - for_range(moment_update_functor); - } - } else if (grad_var->IsType()) { - PADDLE_ENFORCE_EQ(IsMultiPrecision, - false, - platform::errors::Unimplemented( - "SelectedRows gradient is not supported when " - "multi_precision=True.")); - constexpr bool kIsSameType = std::is_same::value; - PADDLE_ENFORCE_EQ(kIsSameType, - true, - platform::errors::Unimplemented( - "SelectedRows gradient is not supported when " - "multi_precision=True.")); - auto& grad = GET_DATA_SAFELY( - ctx.Input("Grad"), "Input", "Grad", "Lamb"); - if (grad.rows().size() == 0) { - VLOG(3) << "grad row size is 0!!"; - return; - } - - std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); - bool is_strict_sorted = true; - for (size_t i = 1; i < cpu_rows.size(); ++i) { - if (cpu_rows[i - 1] >= cpu_rows[i]) { - is_strict_sorted = false; - break; - } - } - - phi::SelectedRows tmp_grad_merge; - const phi::SelectedRows* grad_merge_ptr; - if (is_strict_sorted) { - grad_merge_ptr = &grad; - } else { - // merge duplicated rows if any. 
- // The rows of grad_merge have been sorted inside MergeAdd functor - scatter::MergeAdd merge_func; - merge_func(dev_ctx, grad, &tmp_grad_merge, true); - grad_merge_ptr = &tmp_grad_merge; - } - - auto& grad_merge = *grad_merge_ptr; - auto& grad_tensor = grad_merge.value(); - const T* grad_data = grad_tensor.template data(); - auto* grad_merge_rows = &grad_merge.rows(); - paddle::framework::MixVector mixv_grad_merge_rows( - grad_merge_rows); - const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); - auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); - if (platform::is_gpu_place(ctx.GetPlace()) && - beta1_pow.place() == platform::CPUPlace() && - beta2_pow.place() == platform::CPUPlace()) { - SparseLambMomentREGUpdateFunctor moment_update_functor( - static_cast(weight_decay), - static_cast(beta1), - static_cast(beta2), - static_cast(epsilon), - *beta1_pow.template data(), - *beta2_pow.template data(), - mom1.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2.template data(), - mom2_out.template mutable_data(ctx.GetPlace()), - grad_data, - param.template data(), - trust_ratio_div.template data(), - rows, - row_numel, - grad_merge.rows().size(), - skip_update_flag); - for_range(moment_update_functor); - beta1_pow_out.template mutable_data(platform::CPUPlace())[0] = - static_cast(beta1) * beta1_pow.template data()[0]; - beta2_pow_out.template mutable_data(platform::CPUPlace())[0] = - static_cast(beta2) * beta2_pow.template data()[0]; - } else { - beta1_pow_ptr = beta1_pow.template data(); - beta2_pow_ptr = beta2_pow.template data(); - beta1_pow_out_ptr = - beta1_pow_out.template mutable_data(ctx.GetPlace()); - beta2_pow_out_ptr = - beta2_pow_out.template mutable_data(ctx.GetPlace()); - should_update_beta_pow_later = true; - SparseLambMomentMENUpdateFunctor moment_update_functor( - static_cast(weight_decay), - static_cast(beta1), - static_cast(beta2), - static_cast(epsilon), - reinterpret_cast(beta1_pow_ptr), - reinterpret_cast(beta2_pow_ptr), - mom1.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2.template data(), - mom2_out.template mutable_data(ctx.GetPlace()), - grad_data, - param.template data(), - trust_ratio_div.template data(), - rows, - row_numel, - grad_merge.rows().size(), - skip_update_flag); - for_range(moment_update_functor); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable type not supported by lamb_op. Expect LoDTensor or " - "SelectedRows, but got %s", - framework::ToTypeName(grad_var->Type()))); - } - - // Update parameter - auto p_norm_t = ctx.AllocateTmpTensor({1}, dev_ctx); - auto* p_norm_ptr = p_norm_t.template data(); - - auto trust_ratio_div_norm_t = - ctx.AllocateTmpTensor({1}, dev_ctx); - auto* trust_ratio_div_norm_ptr = trust_ratio_div_norm_t.template data(); - - // TODO(zengjinle): remove the following Eigen operations when - // *skip_update == true. - memory::Buffer buffer(dev_ctx.GetPlace()); - phi::funcs::SquaredL2Norm( - dev_ctx, - reinterpret_cast(IsMultiPrecision ? 
master_param_ptr - : param_ptr), - p_norm_ptr, - numel, - &buffer); - phi::funcs::SquaredL2Norm( - dev_ctx, trust_ratio_div_ptr, trust_ratio_div_norm_ptr, numel, &buffer); - - if (VLOG_IS_ON(1)) { - const auto& name = ctx.GetOp().Input("Param"); - auto pn = ToVector(p_norm_ptr, 1, dev_ctx.GetPlace()); - auto tn = ToVector(trust_ratio_div_norm_ptr, 1, dev_ctx.GetPlace()); - auto dtype = - framework::DataTypeToString(framework::DataTypeTrait::DataType()); - VLOG(1) << "Param " << dtype << " " << name << " pn = " << pn[0] - << " , tn = " << tn[0]; - } - -#define CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(__should_update_beta_pow) \ - do { \ - LambParamUpateFunctor \ - param_update_functor(lr.template data(), \ - static_cast(param_ptr), \ - static_cast(master_param_ptr), \ - p_norm_ptr, \ - trust_ratio_div_ptr, \ - trust_ratio_div_norm_ptr, \ - static_cast(param_out_ptr), \ - static_cast(master_param_out_ptr), \ - skip_update_flag); \ - if (__should_update_beta_pow) { \ - param_update_functor.SetBetaPows(beta1_pow_ptr, \ - beta2_pow_ptr, \ - beta1_pow_out_ptr, \ - beta2_pow_out_ptr, \ - beta1, \ - beta2); \ - } \ - for_range(param_update_functor); \ - } while (0) - - if (should_update_beta_pow_later) { - CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(true); - } else { - CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(false); - } - -#undef CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc index e0233fadb8858..bfeb42a221fa7 100644 --- a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gflags/gflags.h" -#include "paddle/fluid/operators/optimizers/lamb_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_xpu.cc new file mode 100644 index 0000000000000..3993a46add480 --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_xpu.cc @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#ifdef PADDLE_WITH_XPU +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" +namespace paddle { +namespace operators { + +template +class MergedMomentumOpXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + T mu = static_cast(ctx.Attr("mu")); + auto params = ctx.MultiInput("Param"); + auto params_out = ctx.MultiOutput("ParamOut"); + auto lr = ctx.Input("LearningRate"); + int op_num = params.size(); + auto velocity = ctx.MultiInput("Velocity"); + auto grad = ctx.MultiInput("Grad"); + auto velocity_out = ctx.MultiOutput("VelocityOut"); + auto use_nesterov = ctx.Attr("use_nesterov"); + auto regularization_method = + ctx.Attr>("regularization_method"); + auto regularization_coeff = + ctx.Attr>("regularization_coeff"); + std::vector param_list(op_num); + std::vector velocity_list(op_num); + std::vector grad_list(op_num); + std::vector velocity_out_list(op_num); + std::vector param_out_list(op_num); + std::vector sizes(op_num); + std::vector l2_weight_decay(op_num); + if (op_num > 0) { + for (int j = 0; j < op_num; j++) { + param_list[j] = + reinterpret_cast(const_cast(params[j]->data())); + velocity_list[j] = + reinterpret_cast(const_cast(velocity[j]->data())); + grad_list[j] = + reinterpret_cast(const_cast(grad[j]->data())); + param_out_list[j] = + reinterpret_cast(params_out[j]->data()); + velocity_out_list[j] = + reinterpret_cast(velocity_out[j]->data()); + sizes[j] = static_cast(params[j]->numel()); + if (regularization_method[j] != "l2_decay") { + l2_weight_decay[j] = 0.0f; + } else { + l2_weight_decay[j] = static_cast(regularization_coeff[j]); + } + PADDLE_ENFORCE_EQ(params[j], + params_out[j], + platform::errors::InvalidArgument( + "The size of Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); + PADDLE_ENFORCE_EQ( + velocity[j], + velocity_out[j], + platform::errors::InvalidArgument( + "The size of Input(velocity) and Output(velocity) " + "must be the same Tensors.")); + } + } else { + return; + } + auto& dev_ctx = ctx.template device_context(); + PADDLE_ENFORCE_EQ(op_num, + params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), + op_num)); + PADDLE_ENFORCE_EQ(op_num, + velocity.size(), + platform::errors::InvalidArgument( + "The size of Output(Velocity) must be equal to " + "Input(Param), but got the size of Output(Velocity) " + "is %d, the size of Input(Param) is %d.", + velocity.size(), + op_num)); + PADDLE_ENFORCE_EQ( + op_num, + velocity_out.size(), + platform::errors::InvalidArgument( + "The size of Output(VelocityOut) must be equal to " + "Input(Param), but got the size of Output(VelocityOut) " + "is %d, the size of Input(Param) is %d.", + velocity_out.size(), + op_num)); + PADDLE_ENFORCE_EQ( + op_num, + grad.size(), + platform::errors::InvalidArgument( + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grad.size(), + op_num)); + int r = xpu::merged_momentum(dev_ctx.x_context(), + param_list, + velocity_list, + grad_list, + param_out_list, + velocity_out_list, + l2_weight_decay, + sizes, + 
lr->data(), + mu, + use_nesterov); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merged_momentum"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + merged_momentum, + ops::MergedMomentumOpXPUKernel, + ops::MergedMomentumOpXPUKernel); +#endif diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index bfa638e5bde5d..60f05e6d8e74f 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -44,6 +44,15 @@ class PoolXPUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "The Pool2d XPU OP only support 2 dimension pooling!")); + std::string data_format = context.Attr("data_format"); + PADDLE_ENFORCE_EQ( + data_format, + "NCHW", + platform::errors::InvalidArgument("The Pool2d XPU OP only support" + "data_format is 'NCHW', but received " + "%s", + data_format)); + int* index_data = nullptr; bool global_pooling = context.Attr("global_pooling") || (adaptive && (ksize[0] * ksize[1] == 1)); @@ -173,6 +182,16 @@ class PoolGradXPUKernel : public framework::OpKernel { bool exclusive = context.Attr("exclusive"); bool adaptive = context.Attr("adaptive"); bool ceil_mode = context.Attr("ceil_mode"); + + std::string data_format = context.Attr("data_format"); + PADDLE_ENFORCE_EQ( + data_format, + "NCHW", + platform::errors::InvalidArgument("The Pool2d_grad XPU OP only support" + "data_format is 'NCHW', but received " + "%s", + data_format)); + std::string padding_algorithm = context.Attr("padding_algorithm"); const int* index_data = nullptr; @@ -202,13 +221,6 @@ class PoolGradXPUKernel : public framework::OpKernel { const int out_h = out->dims()[2]; const int out_w = out->dims()[3]; - PADDLE_ENFORCE_EQ(!adaptive || (ksize[0] * ksize[1] == 1) || - (in_h % out_h == 0 && in_w % out_w == 0), - true, - platform::errors::InvalidArgument( - "The Pool2d XPU OP does not support (adaptive == " - "true && output_size != 1)")); - framework::DDim data_dims; data_dims = phi::slice_ddim(in_x->dims(), 2, in_x->dims().size()); @@ -234,7 +246,8 @@ class PoolGradXPUKernel : public framework::OpKernel { auto input_grad = reinterpret_cast(in_x_grad->data()); auto& dev_ctx = context.template device_context(); int r = xpu::Error_t::SUCCESS; - if (adaptive && in_h % out_h == 0 && in_w % out_w == 0) { + if (adaptive) { + // floor for stride strides = {in_h / out_h, in_w / out_w}; int kh = in_h - (out_h - 1) * strides[0]; int kw = in_w - (out_w - 1) * strides[1]; @@ -243,6 +256,7 @@ class PoolGradXPUKernel : public framework::OpKernel { } if (pooling_type == "max") { + // TODO(zhanghuan05) to bind max_pool2d_grad_indices xpu api r = xpu::max_pool2d_grad(dev_ctx.x_context(), input, output, diff --git a/paddle/fluid/operators/prim_ops/CMakeLists.txt b/paddle/fluid/operators/prim_ops/CMakeLists.txt index d29933bc1964a..2583d8cfd9ccb 100644 --- a/paddle/fluid/operators/prim_ops/CMakeLists.txt +++ b/paddle/fluid/operators/prim_ops/CMakeLists.txt @@ -23,7 +23,8 @@ set(PRIM_OP_SRCS sqrt_p_op.cc tanh_p_op.cc matmul_p_op.cc - fill_constant_p_op.cc) + fill_constant_p_op.cc + log_p_op.cc) cc_test( prim_op_test diff --git a/paddle/fluid/operators/prim_ops/log_p_op.cc b/paddle/fluid/operators/prim_ops/log_p_op.cc new file mode 100644 index 0000000000000..199ef0bad36b9 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/log_p_op.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class LogPrimOp : public framework::OperatorBase { + public: + LogPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator log_p should not be executed directly")); + } +}; + +class LogPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of log_p op."); + AddOutput("Y", "(Tensor), The output tensor of log_p op."); + AddComment(R"DOC( +Autograd primitive log_p operator. +)DOC"); + } +}; + +class LogPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + + framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); + + PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); + } +}; + +class LogPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(log_p, + paddle::operators::LogPrimOp, + paddle::operators::LogPrimOpMaker, + paddle::operators::LogPrimOpShapeInference, + paddle::operators::LogPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/prim_op_test.cc b/paddle/fluid/operators/prim_ops/prim_op_test.cc index df5de4e1ab42f..5fb7ae823081c 100644 --- a/paddle/fluid/operators/prim_ops/prim_op_test.cc +++ b/paddle/fluid/operators/prim_ops/prim_op_test.cc @@ -34,6 +34,7 @@ USE_OP_ITSELF(sqrt_p); USE_OP_ITSELF(tanh_p); USE_OP_ITSELF(matmul_p); USE_OP_ITSELF(fill_constant_p); +USE_OP_ITSELF(log_p); namespace paddle { namespace framework { @@ -595,5 +596,24 @@ TEST(PrimOp, fill_constant_p) { ASSERT_EQ(shapes[2], 5L); } +TEST(PrimOp, log_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "log_p", {{"X", {x0}}}, {{"Y", {x1}}}, {}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); +
ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/prim_ops/split_p_op.cc b/paddle/fluid/operators/prim_ops/split_p_op.cc index 4ad9d82467d28..0584de504e770 100644 --- a/paddle/fluid/operators/prim_ops/split_p_op.cc +++ b/paddle/fluid/operators/prim_ops/split_p_op.cc @@ -51,7 +51,7 @@ class SplitPrimOpMaker : public framework::OpProtoAndCheckerMaker { "num_or_sections indicates the number of equal sized sub-Tensors that " "the input will be divided into. If num_or_sections has more then one " "element, the length of it indicates the number of sub-Tensors and the " - "elements in it indicate the sizes of sub-Tensors’ dimension orderly. " + "elements in it indicate the sizes of sub-Tensors' dimension orderly. " "The length of the vector must not be larger than the input's size of " "specified axis."); AddComment(R"DOC( diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h deleted file mode 100644 index f69573e18927e..0000000000000 --- a/paddle/fluid/operators/spectral_helper.h +++ /dev/null @@ -1,545 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/operators/spectral_op.h" - -#if defined(PADDLE_WITH_ONEMKL) -#include "paddle/phi/backends/dynload/mklrt.h" -#elif defined(PADDLE_WITH_POCKETFFT) -#include "extern_pocketfft/pocketfft_hdronly.h" -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// FFT Functors -#if defined(PADDLE_WITH_ONEMKL) - -#define MKL_DFTI_CHECK(expr) \ - do { \ - MKL_LONG status = (expr); \ - if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ - PADDLE_THROW( \ - platform::errors::External(phi::dynload::DftiErrorMessage(status))); \ - } while (0); - -struct DftiDescriptorDeleter { - void operator()(DFTI_DESCRIPTOR_HANDLE handle) { - if (handle != nullptr) { - MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); - } - } -}; - -// A RAII wrapper for MKL_DESCRIPTOR* -class DftiDescriptor { - public: - void init(DFTI_CONFIG_VALUE precision, - DFTI_CONFIG_VALUE signal_type, - MKL_LONG signal_ndim, - MKL_LONG* sizes) { - PADDLE_ENFORCE_EQ(desc_.get(), - nullptr, - platform::errors::AlreadyExists( - "DftiDescriptor has already been initialized.")); - - DFTI_DESCRIPTOR* raw_desc; - MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( - &raw_desc, precision, signal_type, signal_ndim, sizes)); - desc_.reset(raw_desc); - } - - DFTI_DESCRIPTOR* get() const { - DFTI_DESCRIPTOR* raw_desc = desc_.get(); - PADDLE_ENFORCE_NOT_NULL(raw_desc, - platform::errors::PreconditionNotMet( - "DFTI DESCRIPTOR has not been initialized.")); - return raw_desc; - } - - private: - std::unique_ptr desc_; -}; - -static DftiDescriptor _plan_mkl_fft( - const framework::proto::VarType::Type& in_dtype, - const framework::proto::VarType::Type& out_dtype, - const framework::DDim& in_strides, - const framework::DDim& out_strides, - const std::vector& signal_sizes, - FFTNormMode normalization, - bool forward) { - const DFTI_CONFIG_VALUE precision = [&] { - switch (in_dtype) { - case framework::proto::VarType::FP32: - return DFTI_SINGLE; - case framework::proto::VarType::COMPLEX64: - return DFTI_SINGLE; - case framework::proto::VarType::FP64: - return DFTI_DOUBLE; - case framework::proto::VarType::COMPLEX128: - return DFTI_DOUBLE; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input datatype (%s), input data type should be FP32, " - "FP64, COMPLEX64 or COMPLEX128.", - framework::DataTypeToString(in_dtype))); - } - }(); - - // C2C, R2C, C2R - const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); - const DFTI_CONFIG_VALUE domain = - (fft_type == FFTTransformType::C2C) ? 
DFTI_COMPLEX : DFTI_REAL; - - DftiDescriptor descriptor; - std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); - const MKL_LONG signal_ndim = fft_sizes.size() - 1; - descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); - - // placement inplace or not inplace - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); - - // number of transformations - const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); - - // input & output distance - const MKL_LONG idist = in_strides[0]; - const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); - - // input & output stride - std::vector mkl_in_stride(1 + signal_ndim, 0); - std::vector mkl_out_stride(1 + signal_ndim, 0); - for (MKL_LONG i = 1; i <= signal_ndim; i++) { - mkl_in_stride[i] = in_strides[i]; - mkl_out_stride[i] = out_strides[i]; - } - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); - - // conjugate even storage - if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(phi::dynload::DftiSetValue( - descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); - } - - MKL_LONG signal_numel = std::accumulate(fft_sizes.cbegin() + 1, - fft_sizes.cend(), - 1UL, - std::multiplies()); - if (normalization != FFTNormMode::none) { - const double scale = - ((normalization == FFTNormMode::by_sqrt_n) - ? 
1.0 / std::sqrt(static_cast(signal_numel)) - : 1.0 / static_cast(signal_numel)); - const auto scale_direction = [&]() { - if (fft_type == FFTTransformType::R2C || - (fft_type == FFTTransformType::C2C && forward)) { - return DFTI_FORWARD_SCALE; - } else { - // (fft_type == FFTTransformType::C2R || - // (fft_type == FFTTransformType::C2C && !forward)) - return DFTI_BACKWARD_SCALE; - } - }(); - MKL_DFTI_CHECK( - phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); - } - - // commit the descriptor - MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); - return descriptor; -} - -// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) -template -void exec_fft(const DeviceContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - const framework::DDim& in_sizes = x->dims(); - const int ndim = in_sizes.size(); - const int signal_ndim = axes.size(); - const int batch_ndim = ndim - signal_ndim; - const framework::DDim& out_sizes = out->dims(); - - // make a dim permutation - std::vector dim_permute(ndim); - std::iota(dim_permute.begin(), dim_permute.end(), 0); - std::vector is_transformed_dim(ndim, false); - for (const auto& d : axes) { - is_transformed_dim[d] = true; - } - const auto batch_end = - std::partition(dim_permute.begin(), dim_permute.end(), [&](size_t axis) { - return !is_transformed_dim[axis]; - }); - std::copy(axes.cbegin(), axes.cend(), batch_end); - - // transpose input according to that permutation - framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute); - std::vector transposed_input_shape_ = - phi::vectorize(transposed_input_shape); - framework::Tensor transposed_input; - transposed_input.Resize(transposed_input_shape); - const auto place = ctx.GetPlace(); - transposed_input.mutable_data(place); - TransCompute( - ndim, ctx, *x, &transposed_input, dim_permute); - - // make an collapsed input: collapse batch axes for input - const int batch_size = - std::accumulate(transposed_input_shape.Get(), - transposed_input_shape.Get() + batch_ndim, - 1L, - std::multiplies()); - std::vector collapsed_input_shape_(1 + signal_ndim); - collapsed_input_shape_[0] = batch_size; - std::copy(transposed_input_shape_.begin() + batch_ndim, - transposed_input_shape_.end(), - collapsed_input_shape_.begin() + 1); - const framework::DDim collapsed_input_shape = - phi::make_ddim(collapsed_input_shape_); - transposed_input.Resize(collapsed_input_shape); - framework::Tensor& collapsed_input = transposed_input; - - // make a collapsed output - std::vector collapsed_output_shape_(1 + signal_ndim); - collapsed_output_shape_[0] = batch_size; - for (int i = 0; i < signal_ndim; i++) { - collapsed_output_shape_[1 + i] = out_sizes[axes[i]]; - } - const framework::DDim collapsed_output_shape = - phi::make_ddim(collapsed_output_shape_); - framework::Tensor collapsed_output; - collapsed_output.Resize(collapsed_output_shape); - collapsed_output.mutable_data(place, out->type()); - - // signal sizes - std::vector signal_sizes(1 + signal_ndim); - signal_sizes[0] = batch_size; - for (int i = 0; i < signal_ndim; i++) { - signal_sizes[1 + i] = - std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); - } - - // input & output stride - const framework::DDim input_stride = phi::stride(collapsed_input_shape); - const framework::DDim output_stride = phi::stride(collapsed_output_shape); - - // make a DFTI_DESCRIPTOR - DftiDescriptor desc = - 
_plan_mkl_fft(framework::TransToProtoVarType(x->dtype()), - framework::TransToProtoVarType(out->dtype()), - input_stride, - output_stride, - signal_sizes, - normalization, - forward); - - const FFTTransformType fft_type = - GetFFTTransformType(framework::TransToProtoVarType(x->dtype()), - framework::TransToProtoVarType(out->type())); - if (fft_type == FFTTransformType::C2R && forward) { - framework::Tensor collapsed_input_conj(collapsed_input.dtype()); - collapsed_input_conj.mutable_data(collapsed_input.dims(), - ctx.GetPlace()); - // conjugate the input - platform::ForRange for_range(ctx, collapsed_input.numel()); - phi::funcs::ConjFunctor functor(collapsed_input.data(), - collapsed_input.numel(), - collapsed_input_conj.data()); - for_range(functor); - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( - desc.get(), collapsed_input_conj.data(), collapsed_output.data())); - } else if (fft_type == FFTTransformType::R2C && !forward) { - framework::Tensor collapsed_output_conj(collapsed_output.dtype()); - collapsed_output_conj.mutable_data(collapsed_output.dims(), - ctx.GetPlace()); - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), collapsed_output_conj.data())); - // conjugate the output - platform::ForRange for_range(ctx, collapsed_output.numel()); - phi::funcs::ConjFunctor functor(collapsed_output_conj.data(), - collapsed_output.numel(), - collapsed_output.data()); - for_range(functor); - } else { - if (forward) { - MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), collapsed_output.data())); - } else { - MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( - desc.get(), collapsed_input.data(), collapsed_output.data())); - } - } - - // resize for the collapsed output - framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute); - collapsed_output.Resize(transposed_output_shape); - framework::Tensor& transposed_output = collapsed_output; - - // reverse the transposition - std::vector reverse_dim_permute(ndim); - for (int i = 0; i < ndim; i++) { - reverse_dim_permute[dim_permute[i]] = i; - } - TransCompute( - ndim, ctx, transposed_output, out, reverse_dim_permute); -} - -template -struct FFTC2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - exec_fft( - ctx, x, out, axes, normalization, forward); - } -}; - -template -struct FFTR2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - exec_fft( - ctx, x, out, axes, normalization, forward); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - if (axes.size() > 1) { - const std::vector c2c_dims(axes.begin(), axes.end() - 1); - Tensor temp; - temp.mutable_data(x->dims(), ctx.GetPlace()); - - FFTC2CFunctor c2c_functor; - c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward); - - const std::vector new_axes{axes.back()}; - exec_fft( - ctx, &temp, out, new_axes, normalization, forward); - } else { - exec_fft( - ctx, x, out, axes, normalization, forward); - } - } -}; -#elif defined(PADDLE_WITH_POCKETFFT) - -template -T compute_factor(int64_t size, FFTNormMode normalization) { - constexpr auto one = static_cast(1); - switch (normalization) { - case FFTNormMode::none: - return one; - case 
FFTNormMode::by_n: - return one / static_cast(size); - case FFTNormMode::by_sqrt_n: - return one / std::sqrt(static_cast(size)); - } - PADDLE_THROW( - platform::errors::InvalidArgument("Unsupported normalization type")); -} - -template -struct FFTC2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - using R = typename Ti::value_type; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - const int64_t data_size = sizeof(C); - std::transform(in_strides.begin(), - in_strides.end(), - in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - - const auto* in_data = reinterpret_cast(x->data()); - auto* out_data = reinterpret_cast(out->data()); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= in_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::c2c(in_sizes, - in_strides, - in_strides, - axes_, - forward, - in_data, - out_data, - factor); - } -}; - -template -struct FFTR2CFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - using R = Ti; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - { - const int64_t data_size = sizeof(R); - std::transform(in_strides.begin(), - in_strides.end(), - in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); - std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); - { - const int64_t data_size = sizeof(C); - std::transform(out_strides.begin(), - out_strides.end(), - out_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto* in_data = x->data(); - auto* out_data = reinterpret_cast(out->data()); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet normalization factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= in_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::r2c(in_sizes, - in_strides, - out_strides, - axes_, - forward, - in_data, - out_data, - factor); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const phi::CPUContext& ctx, - const Tensor* x, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - using R = To; - using C = std::complex; - - const auto& input_dim = x->dims(); - const std::vector in_sizes = phi::vectorize(input_dim); - std::vector in_strides = - phi::vectorize(phi::stride(input_dim)); - { - const int64_t data_size = sizeof(C); - std::transform(in_strides.begin(), - in_strides.end(), - in_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto& output_dim = out->dims(); - const std::vector out_sizes = phi::vectorize(output_dim); - std::vector out_strides = - phi::vectorize(phi::stride(output_dim)); - { - const int64_t data_size = sizeof(R); 
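// Note: pocketfft's C++ interface takes strides expressed in bytes
// (std::ptrdiff_t), while phi::stride() yields strides in elements; the
// std::transform just below rescales the output element strides by sizeof(R),
// matching the sizeof(C) scaling applied to the input strides above, before
// the arrays are handed to pocketfft::c2r.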
- std::transform(out_strides.begin(), - out_strides.end(), - out_strides.begin(), - [&](std::ptrdiff_t s) { return s * data_size; }); - } - - const auto* in_data = reinterpret_cast(x->data()); - auto* out_data = out->data(); - // pocketfft requires std::vector - std::vector axes_(axes.size()); - std::copy(axes.begin(), axes.end(), axes_.begin()); - // compuet normalization factor - int64_t signal_numel = 1; - for (auto i : axes) { - signal_numel *= out_sizes[i]; - } - R factor = compute_factor(signal_numel, normalization); - pocketfft::c2r(out_sizes, - in_strides, - out_strides, - axes_, - forward, - in_data, - out_data, - factor); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc deleted file mode 100644 index 91e3880dff004..0000000000000 --- a/paddle/fluid/operators/spectral_op.cc +++ /dev/null @@ -1,389 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/spectral_op.h" - -#include "paddle/fluid/operators/spectral_helper.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// FFTC2C -class FFTC2COpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), the input tensor of fft_c2c op."); - AddOutput("Out", "(Tensor), the output tensor of fft_c2c op."); - AddAttr>("axes", - "std::vector, the fft axes."); - AddAttr("normalization", - "fft_norm_type, the fft normalization type."); - AddAttr("forward", "bool, the fft direction."); - AddComment(R"DOC( - Compute complex to complex FFT. 
- )DOC"); - } -}; - -class FFTC2COp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_c2c"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_c2c"); - const auto axes = ctx->Attrs().Get>("axes"); - const auto x_dim = ctx->GetInputDim("X"); - for (size_t i = 0; i < axes.size(); i++) { - PADDLE_ENFORCE_GT(x_dim[axes[i]], - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", x_dim[axes[i]])); - } - ctx->ShareDim("X", /*->*/ "Out"); // only for c2c - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -template -class FFTC2CGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fft_c2c_grad"); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class FFTC2CGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - const auto out_grad_name = framework::GradVarName("Out"); - OP_INOUT_CHECK( - ctx->HasInput(out_grad_name), "Input", out_grad_name, "fft_c2c_grad"); - const auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK( - ctx->HasOutput(x_grad_name), "Output", x_grad_name, "fft_c2c_grad"); - - ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name)); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -// FFTR2C -class FFTR2COpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), the input tensor of fft_r2c op."); - AddOutput("Out", "(Tensor), the output tensor of fft_r2c op."); - AddAttr>("axes", - "std::vector, the fft axes."); - AddAttr("normalization", - "fft_norm_type, the fft normalization type."); - AddAttr("forward", "bool, the fft direction."); - AddAttr("onesided", "bool, perform onesided fft."); - AddComment(R"DOC( - Compute real to complex FFT. 
- )DOC"); - } -}; - -class FFTR2COp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_r2c"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_r2c"); - const auto axes = ctx->Attrs().Get>("axes"); - const auto x_dim = ctx->GetInputDim("X"); - for (size_t i = 0; i < axes.size() - 1L; i++) { - PADDLE_ENFORCE_GT(x_dim[axes[i]], - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", x_dim[axes[i]])); - } - - const bool onesided = ctx->Attrs().Get("onesided"); - if (!onesided) { - ctx->ShareDim("X", /*->*/ "Out"); - } else { - framework::DDim out_dim(ctx->GetInputDim("X")); - const int64_t last_fft_axis = axes.back(); - const int64_t last_fft_dim_size = out_dim.at(last_fft_axis); - out_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; - ctx->SetOutputDim("Out", out_dim); - } - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(in_dtype, ctx.GetPlace()); - } -}; - -template -class FFTR2CGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fft_r2c_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class FFTR2CGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - const auto out_grad_name = framework::GradVarName("Out"); - OP_INOUT_CHECK( - ctx->HasInput(out_grad_name), "Input", out_grad_name, "fft_r2c_grad"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_r2c_grad"); - - const auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK( - ctx->HasOutput(x_grad_name), "Output", x_grad_name, "fft_r2c_grad"); - - ctx->ShareDim("X", /*->*/ x_grad_name); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -// FFTC2R -class FFTC2ROpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), the input tensor of fft_c2r op."); - AddOutput("Out", "(Tensor), the output tensor of fft_c2r op."); - AddAttr>("axes", - "std::vector, the fft axes."); - AddAttr("normalization", - "fft_norm_type, the fft normalization type."); - AddAttr("forward", "bool, the fft direction."); - AddAttr( - "last_dim_size", - "int", - "Length of the transformed " - "axis of the output. For n output points, last_dim_size//2 + 1 input" - " points are necessary. If the input is longer than this," - " it is cropped. If it is shorter than this, it is padded" - " with zeros. 
If last_dim_size is not given, it is taken to be 2*(m-1)" - " where m is the length of the input along the axis " - "specified by axis.") - .SetDefault(0L); - AddComment(R"DOC( - Compute complex to complex FFT. - )DOC"); - } -}; - -class FFTC2ROp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fft_c2r"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "fft_c2r"); - - const auto axes = ctx->Attrs().Get>("axes"); - const auto x_dim = ctx->GetInputDim("X"); - for (size_t i = 0; i < axes.size() - 1L; i++) { - PADDLE_ENFORCE_GT(x_dim[axes[i]], - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", x_dim[axes[i]])); - } - - const int64_t last_dim_size = ctx->Attrs().Get("last_dim_size"); - framework::DDim out_dim(ctx->GetInputDim("X")); - const int64_t last_fft_axis = axes.back(); - if (last_dim_size == 0) { - const int64_t last_fft_dim_size = out_dim.at(last_fft_axis); - const int64_t fft_n_point = (last_fft_dim_size - 1) * 2; - PADDLE_ENFORCE_GT(fft_n_point, - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", fft_n_point)); - out_dim.at(last_fft_axis) = fft_n_point; - } else { - PADDLE_ENFORCE_GT(last_dim_size, - 0, - platform::errors::InvalidArgument( - "Invalid fft n-point (%d).", last_dim_size)); - out_dim.at(last_fft_axis) = last_dim_size; - } - ctx->SetOutputDim("Out", out_dim); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - const auto kernel_dtype = framework::ToRealType(in_dtype); - return framework::OpKernelType(kernel_dtype, ctx.GetPlace()); - } -}; - -template -class FFTC2RGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fft_c2r_grad"); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class FFTC2RGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - const auto out_grad_name = framework::GradVarName("Out"); - OP_INOUT_CHECK( - ctx->HasInput(out_grad_name), "Input", out_grad_name, "fft_c2r_grad"); - - const auto x_grad_name = framework::GradVarName("X"); - OP_INOUT_CHECK( - ctx->HasOutput(x_grad_name), "Output", x_grad_name, "fft_c2r_grad"); - - const auto axes = ctx->Attrs().Get>("axes"); - - const auto out_grad_dim = ctx->GetInputDim(out_grad_name); - framework::DDim x_grad_dim(out_grad_dim); - const int64_t last_fft_axis = axes.back(); - const int64_t last_fft_dim_size = x_grad_dim.at(last_fft_axis); - x_grad_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; - ctx->SetOutputDim(x_grad_name, x_grad_dim); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - const auto in_dtype = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return framework::OpKernelType(in_dtype, ctx.GetPlace()); - } -}; - -// common functions -FFTNormMode get_norm_from_string(const 
std::string& norm, bool forward) { - if (norm.empty() || norm == "backward") { - return forward ? FFTNormMode::none : FFTNormMode::by_n; - } - - if (norm == "forward") { - return forward ? FFTNormMode::by_n : FFTNormMode::none; - } - - if (norm == "ortho") { - return FFTNormMode::by_sqrt_n; - } - - PADDLE_THROW(platform::errors::InvalidArgument( - "FFT norm string must be 'forward' or 'backward' or 'ortho', " - "received %s", - norm)); -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(fft_c2c, - ops::FFTC2COp, - ops::FFTC2COpMaker, - ops::FFTC2CGradOpMaker, - ops::FFTC2CGradOpMaker); -REGISTER_OP_CPU_KERNEL(fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); - -REGISTER_OPERATOR(fft_c2c_grad, ops::FFTC2CGradOp); -REGISTER_OP_CPU_KERNEL(fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); - -REGISTER_OPERATOR(fft_r2c, - ops::FFTR2COp, - ops::FFTR2COpMaker, - ops::FFTR2CGradOpMaker, - ops::FFTR2CGradOpMaker); -REGISTER_OP_CPU_KERNEL(fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); - -REGISTER_OPERATOR(fft_r2c_grad, ops::FFTR2CGradOp); -REGISTER_OP_CPU_KERNEL(fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); - -REGISTER_OPERATOR(fft_c2r, - ops::FFTC2ROp, - ops::FFTC2ROpMaker, - ops::FFTC2RGradOpMaker, - ops::FFTC2RGradOpMaker); -REGISTER_OP_CPU_KERNEL(fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); - -REGISTER_OPERATOR(fft_c2r_grad, ops::FFTC2RGradOp); -REGISTER_OP_CPU_KERNEL(fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu deleted file mode 100644 index 661fcc83771f5..0000000000000 --- a/paddle/fluid/operators/spectral_op.cu +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/fluid/operators/spectral_op.h" -#include "paddle/fluid/operators/spectral_op.cu.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); - -REGISTER_OP_CUDA_KERNEL(fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); - -REGISTER_OP_CUDA_KERNEL(fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); - -REGISTER_OP_CUDA_KERNEL(fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); - -REGISTER_OP_CUDA_KERNEL(fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); - -REGISTER_OP_CUDA_KERNEL(fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cu.h b/paddle/fluid/operators/spectral_op.cu.h deleted file mode 100644 index 5ced67691ee07..0000000000000 --- a/paddle/fluid/operators/spectral_op.cu.h +++ /dev/null @@ -1,1018 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/operators/spectral_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/hipfft.h" -#endif - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/dynload/cufft.h" -#endif - -namespace paddle { -namespace operators { - -using ScalarType = framework::proto::VarType::Type; -const int64_t kMaxFFTNdim = 3; -const int64_t kMaxDataNdim = kMaxFFTNdim + 1; -// This struct is used to easily compute hashes of the -// parameters. It will be the **key** to the plan cache. -struct FFTConfigKey { - // between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3 - int64_t signal_ndim_; - // These include additional batch dimension as well. - int64_t sizes_[kMaxDataNdim]; - int64_t input_shape_[kMaxDataNdim]; - int64_t output_shape_[kMaxDataNdim]; - FFTTransformType fft_type_; - ScalarType value_type_; - - FFTConfigKey() = default; - - FFTConfigKey(const std::vector& in_shape, - const std::vector& out_shape, - const std::vector& signal_size, - FFTTransformType fft_type, - ScalarType value_type) { - // Padding bits must be zeroed for hashing - memset(this, 0, sizeof(*this)); - signal_ndim_ = signal_size.size() - 1; - fft_type_ = fft_type; - value_type_ = value_type; - - std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); - std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); - std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); - } -}; - -#if defined(PADDLE_WITH_CUDA) -// An RAII encapsulation of cuFFTHandle -class CuFFTHandle { - ::cufftHandle handle_; - - public: - CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_)); - } - - CuFFTHandle(const CuFFTHandle& other) = delete; - CuFFTHandle& operator=(const CuFFTHandle& other) = delete; - - CuFFTHandle(CuFFTHandle&& other) = delete; - CuFFTHandle& operator=(CuFFTHandle&& other) = delete; - - ::cufftHandle& get() { return handle_; } - const ::cufftHandle& get() const { return handle_; } - - ~CuFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); - } -}; - -using plan_size_type = long long int; // NOLINT -// This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class FFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. 
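Because FFTConfigKey above is hashed and compared byte-wise (see the KeyHash and KeyEqual functors further down), its constructor memsets the whole struct before filling any field, so padding bytes cannot make otherwise-equal keys hash differently. A minimal standalone sketch of that pattern, using a hypothetical PlanKey struct and the same FNV-1a scheme:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Hypothetical, simplified stand-in for FFTConfigKey: a POD key that is
    // hashed byte-wise, so padding bytes must be zeroed before use.
    struct PlanKey {
      int64_t signal_ndim;
      int64_t sizes[4];
      int32_t fft_type;
      int32_t value_type;
    };

    // FNV-1a over the raw bytes of a POD key.
    static size_t HashBytes(const void* data, size_t n) {
      const unsigned char* p = static_cast<const unsigned char*>(data);
      uint32_t value = 0x811C9DC5u;  // FNV offset basis
      for (size_t i = 0; i < n; ++i) {
        value ^= p[i];
        value *= 0x01000193u;        // FNV prime
      }
      return static_cast<size_t>(value);
    }

    int main() {
      PlanKey key;
      std::memset(&key, 0, sizeof(key));  // zero padding bytes before filling fields
      key.signal_ndim = 2;
      key.sizes[0] = 8;
      key.sizes[1] = 64;
      key.sizes[2] = 64;
      key.fft_type = 0;    // e.g. C2C
      key.value_type = 5;  // e.g. FP32
      std::printf("plan key hash = %zu\n", HashBytes(&key, sizeof(key)));
      return 0;
    }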
- explicit FFTConfig(const FFTConfigKey& plan_key) - : FFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, - plan_key.fft_type_, - plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - FFTConfig(const std::vector& sizes, - const int64_t signal_ndim, - FFTTransformType fft_type, - ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, - sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, - sizes.size() - 1)); - - cudaDataType itype, otype, exec_type; - const auto complex_input = has_complex_input(fft_type); - const auto complex_output = has_complex_output(fft_type); - if (dtype == framework::proto::VarType::FP32) { - itype = complex_input ? CUDA_C_32F : CUDA_R_32F; - otype = complex_output ? CUDA_C_32F : CUDA_R_32F; - exec_type = CUDA_C_32F; - } else if (dtype == framework::proto::VarType::FP64) { - itype = complex_input ? CUDA_C_64F : CUDA_R_64F; - otype = complex_output ? CUDA_C_64F : CUDA_R_64F; - exec_type = CUDA_C_64F; - } else if (dtype == framework::proto::VarType::FP16) { - itype = complex_input ? CUDA_C_16F : CUDA_R_16F; - otype = complex_output ? CUDA_C_16F : CUDA_R_16F; - exec_type = CUDA_C_16F; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "cuFFT only support transforms of type float16, float32 and " - "float64")); - } - - // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cufftXtMakePlanMany(plan(), - signal_ndim, - signal_sizes.data(), - /* inembed */ nullptr, - /* base_istride */ 1, - /* idist */ 1, - itype, - /* onembed */ nullptr, - /* base_ostride */ 1, - /* odist */ 1, - otype, - batch, - &ws_size_t, - exec_type)); - - ws_size = ws_size_t; - } - - FFTConfig(const FFTConfig& other) = delete; - FFTConfig& operator=(const FFTConfig& other) = delete; - - FFTConfig(FFTConfig&& other) = delete; - FFTConfig& operator=(FFTConfig&& other) = delete; - - const cufftHandle& plan() const { return plan_ptr.get(); } - - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } - - private: - CuFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; - -#elif defined(PADDLE_WITH_HIP) -// An RAII encapsulation of cuFFTHandle -class HIPFFTHandle { - ::hipfftHandle handle_; - - public: - HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); - } - - HIPFFTHandle(const HIPFFTHandle& other) = delete; - HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; - - HIPFFTHandle(HIPFFTHandle&& other) = delete; - HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; - - ::hipfftHandle& get() { return handle_; } - const ::hipfftHandle& get() const { return handle_; } - - ~HIPFFTHandle() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_)); - } -}; -using plan_size_type = int; -// 
This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class FFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. - explicit FFTConfig(const FFTConfigKey& plan_key) - : FFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, - plan_key.fft_type_, - plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - FFTConfig(const std::vector& sizes, - const int64_t signal_ndim, - FFTTransformType fft_type, - ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, - sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, - sizes.size() - 1)); - - hipfftType exec_type = [&] { - if (dtype == framework::proto::VarType::FP32) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_C2C; - case FFTTransformType::R2C: - return HIPFFT_R2C; - case FFTTransformType::C2R: - return HIPFFT_C2R; - } - } else if (dtype == framework::proto::VarType::FP64) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_Z2Z; - case FFTTransformType::R2C: - return HIPFFT_D2Z; - case FFTTransformType::C2R: - return HIPFFT_Z2D; - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); - }(); - - // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::hipfftMakePlanMany(plan(), - signal_ndim, - signal_sizes.data(), - /* inembed */ nullptr, - /* base_istride */ 1, - /* idist */ 1, - /* onembed */ nullptr, - /* base_ostride */ 1, - /* odist */ 1, - exec_type, - batch, - &ws_size_t)); - - ws_size = ws_size_t; - } - - const hipfftHandle& plan() const { return plan_ptr.get(); } - - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } - - private: - HIPFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; -#endif - -// Hashing machinery for Key -// Fowler–Noll–Vo hash function -// see -// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function -template -struct KeyHash { - // Key must be a POD because we read out its memory - // contenst as char* when hashing - static_assert(std::is_pod::value, "Key must be plain old data type"); - - size_t operator()(const Key& params) const { - auto ptr = reinterpret_cast(¶ms); - uint32_t value = 0x811C9DC5; - for (int i = 0; i < static_cast(sizeof(Key)); ++i) { - value ^= ptr[i]; - value *= 0x01000193; - } - return static_cast(value); - } -}; - -template -struct KeyEqual { - // Key must be a POD because we read out its memory - // contenst as char* when comparing - static_assert(std::is_pod::value, "Key must be plain 
old data type"); - - bool operator()(const Key& a, const Key& b) const { - auto ptr1 = reinterpret_cast(&a); - auto ptr2 = reinterpret_cast(&b); - return memcmp(ptr1, ptr2, sizeof(Key)) == 0; - } -}; - -#if CUDA_VERSION < 10000 -// Note that the max plan number for CUDA version < 10 has to be 1023 -// due to a bug that fails on the 1024th plan -constexpr size_t CUFFT_MAX_PLAN_NUM = 1023; -constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; -#else -constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); -// The default max cache size chosen for CUDA version > 10 is arbitrary. -// This number puts a limit on how big of a plan cache should we maintain by -// default. Users can always configure it via cufft_set_plan_cache_max_size. -constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096; -#endif -static_assert(CUFFT_MAX_PLAN_NUM >= 0 && - CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), - "CUFFT_MAX_PLAN_NUM not in size_t range"); -static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && - CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, - "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); - -// This cache assumes that the mapping from key to value never changes. -// This is **NOT** thread-safe. Please use a mutex when using it **AND** the -// value returned from try_emplace_value. -// The contract of using this cache is that try_emplace_value should only be -// used when the max_size is positive. -class FFTConfigCache { - public: - using kv_t = typename std::pair; - using map_t = - typename std::unordered_map, - typename std::list::iterator, - KeyHash, - KeyEqual>; - using map_kkv_iter_t = typename map_t::iterator; - - FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {} - - explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); } - - FFTConfigCache(const FFTConfigCache& other) = delete; - FFTConfigCache& operator=(const FFTConfigCache& other) = delete; - - FFTConfigCache(FFTConfigCache&& other) noexcept - : _usage_list(std::move(other._usage_list)), - _cache_map(std::move(other._cache_map)), - _max_size(other._max_size) {} - - FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { - _usage_list = std::move(other._usage_list); - _cache_map = std::move(other._cache_map); - _max_size = other._max_size; - return *this; - } - - // If key is in this cache, return the cached config. Otherwise, emplace the - // config in this cache and return it. 
- FFTConfig& lookup(FFTConfigKey params) { - PADDLE_ENFORCE_GT(_max_size, - 0, - platform::errors::InvalidArgument( - "The max size of FFTConfigCache must be great than 0," - "But received is [%d]", - _max_size)); - - map_kkv_iter_t map_it = _cache_map.find(params); - // Hit, put to list front - if (map_it != _cache_map.end()) { - _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); - return map_it->second->second; - } - - // Miss - // remove if needed - if (_usage_list.size() >= _max_size) { - auto last = _usage_list.end(); - last--; - _cache_map.erase(last->first); - _usage_list.pop_back(); - } - - // construct new plan at list front, then insert into _cache_map - _usage_list.emplace_front(std::piecewise_construct, - std::forward_as_tuple(params), - std::forward_as_tuple(params)); - auto kv_it = _usage_list.begin(); - _cache_map.emplace(std::piecewise_construct, - std::forward_as_tuple(kv_it->first), - std::forward_as_tuple(kv_it)); - return kv_it->second; - } - - void clear() { - _cache_map.clear(); - _usage_list.clear(); - } - - void resize(int64_t new_size) { - _set_max_size(new_size); - auto cur_size = _usage_list.size(); - if (cur_size > _max_size) { - auto delete_it = _usage_list.end(); - for (size_t i = 0; i < cur_size - _max_size; i++) { - delete_it--; - _cache_map.erase(delete_it->first); - } - _usage_list.erase(delete_it, _usage_list.end()); - } - } - - size_t size() const { return _cache_map.size(); } - - size_t max_size() const noexcept { return _max_size; } - - std::mutex mutex; - - private: - // Only sets size and does value check. Does not resize the data structures. - void _set_max_size(int64_t new_size) { - // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since - // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check - // first. - PADDLE_ENFORCE_GE( - new_size, - 0, - platform::errors::InvalidArgument( - "cuFFT plan cache size must be non-negative, But received is [%d]", - new_size)); - PADDLE_ENFORCE_LE(new_size, - CUFFT_MAX_PLAN_NUM, - platform::errors::InvalidArgument( - "cuFFT plan cache size can not be larger than [%d], " - "But received is [%d]", - CUFFT_MAX_PLAN_NUM, - new_size)); - _max_size = static_cast(new_size); - } - - std::list _usage_list; - map_t _cache_map; - size_t _max_size; -}; - -static std::vector> plan_caches; -static std::mutex plan_caches_mutex; - -static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { - std::lock_guard guard(plan_caches_mutex); - - if (device_index >= plan_caches.size()) { - plan_caches.resize(device_index + 1); - } - - if (!plan_caches[device_index]) { - plan_caches[device_index] = std::make_unique(); - } - - return *plan_caches[device_index]; -} - -// Calculates the normalization constant -static double fft_normalization_scale(FFTNormMode normalization, - const std::vector& sizes, - const std::vector& dims) { - // auto norm = static_cast(normalization); - if (normalization == FFTNormMode::none) { - return static_cast(1.0); - } - - int64_t signal_numel = 1; - for (auto dim : dims) { - signal_numel *= sizes[dim]; - } - const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) - ? 
std::sqrt(signal_numel) - : static_cast(signal_numel); - return static_cast(1.0 / scale_denom); -} - -template -void exec_normalization(const DeviceContext& ctx, - const Tensor* in, - Tensor* out, - FFTNormMode normalization, - const std::vector& sizes, - const std::vector& axes) { - double scale = fft_normalization_scale(normalization, sizes, axes); - if (scale != 1.0) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto dev = ctx.eigen_device(); - EigenScale::Eval(*dev, - eigen_out, - eigen_in, - static_cast(scale), - static_cast(0), - false); - } else { - framework::TensorCopy(*in, ctx.GetPlace(), out); - } -} - -#if defined(PADDLE_WITH_CUDA) -static FFTConfigKey create_fft_configkey(const framework::Tensor& input, - const framework::Tensor& output, - int signal_ndim) { - // Create the transform plan (either from cache or locally) - const auto value_type = - framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) - ? framework::ToRealType(framework::TransToProtoVarType(input.dtype())) - : framework::TransToProtoVarType(input.dtype()); - auto fft_type = - GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), - framework::TransToProtoVarType(output.dtype())); - // signal sizes - std::vector signal_size(signal_ndim + 1); - - signal_size[0] = input.dims()[0]; - for (int64_t i = 1; i <= signal_ndim; ++i) { - auto in_size = input.dims()[i]; - auto out_size = output.dims()[i]; - signal_size[i] = std::max(in_size, out_size); - } - FFTConfigKey key(phi::vectorize(input.dims()), - phi::vectorize(output.dims()), - signal_size, - fft_type, - value_type); - return key; -} - -// Execute a pre-planned transform -static void exec_cufft_plan_raw(const FFTConfig& config, - void* in_data, - void* out_data, - bool forward) { - auto& plan = config.plan(); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec( - plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); -} - -template -void exec_cufft_plan(const DeviceContext& ctx, - const FFTConfig& config, - framework::Tensor* input, - framework::Tensor* output, - bool forward) { - // execute transform plan - auto fft_type = config.transform_type(); - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input->type()); - input_conj.mutable_data(input->dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input->numel()); - phi::funcs::ConjFunctor functor( - input->data(), input->numel(), input_conj.data()); - for_range(functor); - exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output->type()); - out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); - - platform::ForRange for_range(ctx, output->numel()); - phi::funcs::ConjFunctor functor( - out_conj.data(), output->numel(), output->data()); - for_range(functor); - } else { - exec_cufft_plan_raw(config, input->data(), output->data(), forward); - } -} - -#elif defined(PADDLE_WITH_HIP) - -static FFTConfigKey create_fft_configkey(const framework::Tensor& input, - const framework::Tensor& output, - int signal_ndim) { - // Create the transform plan (either from cache or locally) - const auto value_type = - framework::IsComplexType(framework::TransToProtoVarType(input.dtype())) - ? 
framework::ToRealType(framework::TransToProtoVarType(input.dtype())) - : framework::TransToProtoVarType(input.dtype()); - auto fft_type = - GetFFTTransformType(framework::TransToProtoVarType(input.dtype()), - framework::TransToProtoVarType(output.type())); - // signal sizes - std::vector signal_size(signal_ndim + 1); - - signal_size[0] = input.dims()[0]; - for (int64_t i = 1; i <= signal_ndim; ++i) { - auto in_size = input.dims()[i]; - auto out_size = output.dims()[i]; - signal_size[i] = std::max(in_size, out_size); - } - FFTConfigKey key(phi::vectorize(input.dims()), - phi::vectorize(output.dims()), - signal_size, - fft_type, - value_type); - return key; -} - -// Execute a pre-planned transform -static void exec_hipfft_plan_raw(const FFTConfig& config, - void* in_data, - void* out_data, - bool forward) { - auto& plan = config.plan(); - - auto value_type = config.data_type(); - if (value_type == framework::proto::VarType::FP32) { - switch (config.transform_type()) { - case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C( - plan, - static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - } - } else if (value_type == framework::proto::VarType::FP64) { - switch (config.transform_type()) { - case FFTTransformType::C2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z( - plan, - static_cast(in_data), - static_cast(out_data), - forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case FFTTransformType::R2C: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - case FFTTransformType::C2R: { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D( - plan, - static_cast(in_data), - static_cast(out_data))); - return; - } - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); -} - -template -void exec_hipfft_plan(const DeviceContext& ctx, - const FFTConfig& config, - framework::Tensor* input, - framework::Tensor* output, - bool forward) { - auto fft_type = config.transform_type(); - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input->type()); - input_conj.mutable_data(input->dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input->numel()); - phi::funcs::ConjFunctor functor( - input->data(), input->numel(), input_conj.data()); - for_range(functor); - exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output->type()); - out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); - - platform::ForRange for_range(ctx, output->numel()); - phi::funcs::ConjFunctor functor( - out_conj.data(), output->numel(), output->data()); - for_range(functor); - } else { - exec_hipfft_plan_raw(config, input->data(), output->data(), forward); - } -} - -#endif - -// Execute a general unnormalized fft operation (can be c2c, onesided r2c or -// onesided c2r) -template -void exec_fft(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& dim, - bool forward) { - const auto x_dims = phi::vectorize(X->dims()); - const int64_t ndim = static_cast(X->dims().size()); - auto tensor_place = ctx.GetPlace(); - - // make a dim permutation - std::vector dim_permute(ndim); - std::iota(dim_permute.begin(), dim_permute.end(), int{0}); - std::vector is_transformed_dim(ndim); - for (const auto& d : dim) { - is_transformed_dim[d] = true; - } - auto batch_end = - std::partition(dim_permute.begin(), dim_permute.end(), [&](int64_t d) { - return !is_transformed_dim[d]; - }); - std::sort(dim_permute.begin(), batch_end); - std::copy(dim.cbegin(), dim.cend(), batch_end); - - // transpose input according to dim permutation - auto transposed_input_shape = X->dims().transpose(dim_permute); - framework::Tensor transposed_input; - transposed_input.Resize(transposed_input_shape); - transposed_input.mutable_data(tensor_place); - TransCompute( - ndim, ctx, *X, &transposed_input, dim_permute); - - // Reshape batch dimensions into a single dimension - const int64_t signal_ndim = static_cast(dim.size()); - std::vector collapsed_input_shape(signal_ndim + 1); - - auto transposed_input_shape_ = phi::vectorize(transposed_input_shape); - const int64_t batch_dims = ndim - signal_ndim; - auto batch_size = - std::accumulate(transposed_input_shape_.begin(), - transposed_input_shape_.begin() + batch_dims, - static_cast(1), - std::multiplies()); - collapsed_input_shape[0] = batch_size; - - std::copy(transposed_input_shape_.begin() + batch_dims, - transposed_input_shape_.end(), - collapsed_input_shape.begin() + 1); - - framework::Tensor& collapsed_input = transposed_input; - collapsed_input.Resize(phi::make_ddim(collapsed_input_shape)); - - // 
make a collpased output - const auto out_dims = phi::vectorize(out->dims()); - std::vector collapsed_output_shape(1 + signal_ndim); - collapsed_output_shape[0] = batch_size; - for (size_t i = 0; i < dim.size(); ++i) { - collapsed_output_shape[i + 1] = out_dims[dim[i]]; - } - framework::Tensor collapsed_output; - collapsed_output.Resize(phi::make_ddim(collapsed_output_shape)); - collapsed_output.mutable_data(tensor_place); - - FFTConfig* config = nullptr; - -#if defined(PADDLE_WITH_CUDA) - std::unique_ptr config_ = nullptr; - // create plan - FFTConfigKey key = - create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); - bool using_cache = false; -#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200) - using_cache = true; -#endif - - if (using_cache) { - const int64_t device_id = static_cast( - reinterpret_cast(&collapsed_input.place()) - ->GetDeviceId()); - FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); - std::unique_lock guard(plan_cache.mutex, std::defer_lock); - guard.lock(); - config = &(plan_cache.lookup(key)); - } else { - config_ = std::make_unique(key); - config = config_.get(); - } - - // prepare cufft for execution - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cufftSetStream(config->plan(), ctx.stream())); - framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea( - config->plan(), workspace_tensor.data())); - // execute transform plan - exec_cufft_plan( - ctx, *config, &collapsed_input, &collapsed_output, forward); - -#elif defined(PADDLE_WITH_HIP) - // create plan - FFTConfigKey key = - create_fft_configkey(collapsed_input, collapsed_output, signal_ndim); - const int64_t device_id = static_cast( - reinterpret_cast(&collapsed_input.place()) - ->GetDeviceId()); - FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); - std::unique_lock guard(plan_cache.mutex, std::defer_lock); - guard.lock(); - config = &(plan_cache.lookup(key)); - - // prepare cufft for execution - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::hipfftSetStream(config->plan(), ctx.stream())); - framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea( - config->plan(), workspace_tensor.data())); - // execute transform plan - exec_hipfft_plan( - ctx, *config, &collapsed_input, &collapsed_output, forward); -#endif - - // Inverting output by reshape and transpose to original batch and dimension - auto transposed_out_shape = out->dims().transpose(dim_permute); - - collapsed_output.Resize(transposed_out_shape); - auto& transposed_output = collapsed_output; - - std::vector reverse_dim_permute(ndim); - for (size_t i = 0; i < ndim; i++) { - reverse_dim_permute[dim_permute[i]] = i; - } - - TransCompute( - ndim, ctx, transposed_output, out, reverse_dim_permute); -} - -// Use the optimized path to perform single R2C or C2R if transformation dim is -// supported by cuFFT -static bool use_optimized_fft_path(const std::vector& axes) { - // For performance reason, when axes starts with (0, 1), do not use the - // optimized path. 
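Before planning, exec_fft above rearranges the problem so that cuFFT/hipFFT only ever sees a rank-(signal_ndim + 1) tensor: the untransformed axes are permuted to the front and folded into a single batch dimension, and the transformed axes become the innermost dimensions. A standalone sketch of that shape bookkeeping using only standard-library calls (hypothetical rank-4 shape transformed along axes {1, 3}; not the Paddle implementation itself):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      // Hypothetical input: a rank-4 tensor transformed along axes {1, 3}.
      std::vector<int64_t> shape = {8, 16, 32, 64};
      std::vector<int64_t> axes = {1, 3};
      const int64_t ndim = static_cast<int64_t>(shape.size());
      const int64_t signal_ndim = static_cast<int64_t>(axes.size());

      // Permutation that keeps untransformed (batch) axes first, in order,
      // and moves the transformed axes to the back.
      std::vector<int64_t> dim_permute(ndim);
      std::iota(dim_permute.begin(), dim_permute.end(), int64_t{0});
      std::vector<bool> is_fft_axis(ndim, false);
      for (int64_t a : axes) is_fft_axis[a] = true;
      auto batch_end = std::partition(dim_permute.begin(), dim_permute.end(),
                                      [&](int64_t d) { return !is_fft_axis[d]; });
      std::sort(dim_permute.begin(), batch_end);  // partition is not stable
      std::copy(axes.begin(), axes.end(), batch_end);

      // Collapse all batch axes into one leading dimension.
      const int64_t batch_ndim = ndim - signal_ndim;
      int64_t batch_size = 1;
      for (int64_t i = 0; i < batch_ndim; ++i) batch_size *= shape[dim_permute[i]];
      std::vector<int64_t> collapsed_shape = {batch_size};
      for (int64_t i = batch_ndim; i < ndim; ++i)
        collapsed_shape.push_back(shape[dim_permute[i]]);

      // Expected: permutation 0 2 1 3, collapsed shape 256 16 64.
      for (int64_t d : dim_permute) std::printf("%lld ", static_cast<long long>(d));
      std::printf("\n");
      for (int64_t s : collapsed_shape) std::printf("%lld ", static_cast<long long>(s));
      std::printf("\n");
      return 0;
    }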
- if (axes.size() > kMaxFFTNdim || - (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { - return false; - } else { - return true; - } -} - -template -struct FFTC2CFunctor { - void operator()(const phi::GPUContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - if (axes.empty()) { - framework::TensorCopy(*X, ctx.GetPlace(), out); - return; - } - - framework::Tensor* p_out = out; - std::vector out_dims = phi::vectorize(X->dims()); - std::vector working_axes(axes.begin(), axes.end()); - std::vector first_dims; - size_t max_dims; - framework::Tensor working_tensor; - working_tensor.mutable_data(X->dims(), ctx.GetPlace()); - framework::Tensor* p_working_tensor = &working_tensor; - framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor); - - while (true) { - max_dims = - std::min(static_cast(kMaxFFTNdim), working_axes.size()); - first_dims.assign(working_axes.end() - max_dims, working_axes.end()); - - exec_fft( - ctx, p_working_tensor, p_out, first_dims, forward); - working_axes.resize(working_axes.size() - max_dims); - first_dims.clear(); - - if (working_axes.empty()) { - break; - } - - std::swap(p_out, p_working_tensor); - } - exec_normalization( - ctx, p_out, out, normalization, out_dims, axes); - } -}; - -template -struct FFTC2RFunctor { - void operator()(const phi::GPUContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - std::vector in_dims = phi::vectorize(X->dims()); - std::vector out_dims = phi::vectorize(out->dims()); - - if (use_optimized_fft_path(axes)) { - framework::Tensor x_copy(X->type()); - x_copy.mutable_data(X->dims(), ctx.GetPlace()); - framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); - exec_fft(ctx, &x_copy, out, axes, forward); - } else { - framework::Tensor temp_tensor; - temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); - const std::vector dims(axes.begin(), axes.end() - 1); - - FFTC2CFunctor c2c_functor; - c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); - - exec_fft( - ctx, &temp_tensor, out, {axes.back()}, forward); - } - exec_normalization( - ctx, out, out, normalization, out_dims, axes); - } -}; - -// n dimension real to complex FFT use cufft lib -template -struct FFTR2CFunctor { - void operator()(const phi::GPUContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward) { - // Step1: R2C transform on the last dimension - framework::Tensor* r2c_out = out; - const std::vector last_dim{axes.back()}; - std::vector out_dims = phi::vectorize(out->dims()); - exec_fft(ctx, X, r2c_out, last_dim, forward); - - // Step2: C2C transform on the remaining dimension - framework::Tensor c2c_out; - if (axes.size() > 1) { - c2c_out.mutable_data(out->dims(), ctx.GetPlace()); - std::vector remain_dim(axes.begin(), axes.end() - 1); - FFTC2CFunctor fft_c2c_func; - fft_c2c_func( - ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, forward); - } - - const auto in_sizes = phi::vectorize(X->dims()); - framework::Tensor* norm_tensor = axes.size() > 1 ? 
&c2c_out : r2c_out; - exec_normalization( - ctx, norm_tensor, out, normalization, in_sizes, axes); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h deleted file mode 100644 index 9296f997584dd..0000000000000 --- a/paddle/fluid/operators/spectral_op.h +++ /dev/null @@ -1,507 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#define NOMINMAX // to use std::min std::max correctly on windows -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/padding.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "thrust/device_vector.h" -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -enum class FFTNormMode : int64_t { - none, // No normalization - by_sqrt_n, // Divide by sqrt(signal_size) - by_n, // Divide by signal_size -}; - -FFTNormMode get_norm_from_string(const std::string& norm, bool forward); - -// Enum representing the FFT type -enum class FFTTransformType : int64_t { - C2C = 0, // Complex-to-complex - R2C, // Real-to-complex - C2R, // Complex-to-real -}; - -// Create transform type enum from bools representing if input and output are -// complex -inline FFTTransformType GetFFTTransformType( - framework::proto::VarType::Type input_dtype, - framework::proto::VarType::Type output_dtype) { - auto complex_input = framework::IsComplexType(input_dtype); - auto complex_output = framework::IsComplexType(output_dtype); - if (complex_input && complex_output) { - return FFTTransformType::C2C; - } else if (complex_input && !complex_output) { - return FFTTransformType::C2R; - } else if (!complex_input && complex_output) { - return FFTTransformType::R2C; - } - PADDLE_THROW( - platform::errors::InvalidArgument("Real to real FFTs are not supported")); -} - -// Returns true if the transform type has complex input -inline bool has_complex_input(FFTTransformType type) { - switch (type) { - case FFTTransformType::C2C: - case FFTTransformType::C2R: - return true; - - case FFTTransformType::R2C: - return false; - } - PADDLE_THROW(platform::errors::InvalidArgument("Unknown FFTTransformType")); -} - -// Returns true if the transform type has complex output -inline bool has_complex_output(FFTTransformType type) { - switch (type) { - case FFTTransformType::C2C: - case FFTTransformType::R2C: - return true; - - case 
FFTTransformType::C2R: - return false; - } - PADDLE_THROW(platform::errors::InvalidArgument("Unknown FFTTransformType")); -} - -template -struct FFTFillConjGradFunctor { - T* input_; - const size_t axis_; - const int64_t* strides_; - const size_t double_length_; - - FFTFillConjGradFunctor(T* input, - size_t axis, - const int64_t* strides, - size_t double_length) - : input_(input), - axis_(axis), - strides_(strides), - double_length_(double_length) {} - - HOSTDEVICE void operator()(size_t index) { - size_t offtset = index; // back - size_t index_i; - for (size_t i = 0; i <= axis_; i++) { - index_i = offtset / strides_[i]; - offtset %= strides_[i]; - } - - if ((0 < index_i) && (index_i < double_length_ + 1)) { - input_[index] *= static_cast(2); - } - } -}; - -template -struct FFTC2CFunctor { - void operator()(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward); -}; - -template -struct FFTR2CFunctor { - void operator()(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward); -}; - -template -struct FFTC2RFunctor { - void operator()(const DeviceContext& ctx, - const Tensor* X, - Tensor* out, - const std::vector& axes, - FFTNormMode normalization, - bool forward); -}; - -// Giving a linear destination index and strides of tensor, get_idx return the -// corresponding linear position of source tensor. -// The linear index is the position of flatten tensor. -// Giving a linear destination index and strides of tensor, get_idx return the -// corresponding linear position of source tensor. -// The linear index is the position of flatten tensor. -HOSTDEVICE inline int64_t get_src_idx(const int64_t dst_idx, - const int64_t* dst_strides, - const int64_t* dst_shape, - const int64_t* src_strides, - const bool* is_fft_axis, - const bool conj, - const int64_t rank) { - int64_t src_idx = 0; - int64_t quotient = dst_idx; - int64_t remainder = 0; - - for (int64_t i = 0; i < rank; i++) { - remainder = quotient % dst_strides[i]; - quotient = quotient / dst_strides[i]; - if (conj && is_fft_axis[i]) { - src_idx += ((dst_shape[i] - quotient) % dst_shape[i]) * src_strides[i]; - } else { - src_idx += src_strides[i] * quotient; - } - quotient = remainder; - } - - return src_idx; -} - -HOSTDEVICE inline bool is_conj_part(const int64_t dst_idx, - const int64_t* dst_strides, - const int64_t last_axis, - const int64_t last_axis_size) { - int64_t quotient = dst_idx; - int64_t remainder = 0; - - for (int64_t i = 0; i < last_axis + 1; i++) { - remainder = quotient % dst_strides[i]; - quotient = quotient / dst_strides[i]; - - if ((i == last_axis) && (quotient > last_axis_size - 1)) { - return true; - } - - quotient = remainder; - } - - return false; -} - -// FFTFillConjFunctor fill the destination tensor with source tensor and -// conjugate symmetry element of source tensor . 
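FFTFillConjFunctor below restores the redundant half of an R2C spectrum from Hermitian symmetry: for a real-valued input of length N along the last transformed axis, X[k] equals conj(X[N - k]), so only N/2 + 1 bins need to be stored and the rest can be mirrored. A minimal one-dimensional sketch of that reconstruction (plain std::complex, independent of the Paddle tensor machinery):

    #include <complex>
    #include <cstdio>
    #include <vector>

    // Rebuild a full-length spectrum from the onesided half of a real signal's
    // FFT using Hermitian symmetry: X[k] == conj(X[N - k]) for k = 1..N-1.
    std::vector<std::complex<double>> FillConj1D(
        const std::vector<std::complex<double>>& onesided, int n) {
      std::vector<std::complex<double>> full(n);
      for (int k = 0; k < n; ++k) {
        if (k < static_cast<int>(onesided.size())) {
          full[k] = onesided[k];                 // copied (stored) part
        } else {
          full[k] = std::conj(onesided[n - k]);  // mirrored, conjugated part
        }
      }
      return full;
    }

    int main() {
      // Onesided spectrum of the length-4 real signal {0, 1, 2, 3}:
      // 4/2 + 1 = 3 bins are kept.
      std::vector<std::complex<double>> half = {{6, 0}, {-2, 2}, {-2, 0}};
      auto full = FillConj1D(half, 4);
      for (const auto& c : full) std::printf("(%g, %g) ", c.real(), c.imag());
      std::printf("\n");  // prints: (6, 0) (-2, 2) (-2, 0) (-2, -2)
      return 0;
    }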
-// Use framework::ForRange to iterate destination element with -// supporting different device -template -struct FFTFillConjFunctor { - FFTFillConjFunctor(const C* src_data, - C* dst_data, - const int64_t* src_strides, - const int64_t* dst_strides, - const int64_t* dst_shape, - const bool* is_fft_axis, - const int64_t last_axis, - const int64_t last_axis_size, - const int64_t rank) - : src_data_(src_data), - dst_data_(dst_data), - src_strides_(src_strides), - dst_strides_(dst_strides), - dst_shape_(dst_shape), - is_fft_axis_(is_fft_axis), - last_axis_(last_axis), - last_axis_size_(last_axis_size), - rank_(rank) {} - HOSTDEVICE void operator()(int64_t dst_idx) { - if (is_conj_part(dst_idx, dst_strides_, last_axis_, last_axis_size_)) { - const auto conj_idx = get_src_idx(dst_idx, - dst_strides_, - dst_shape_, - src_strides_, - is_fft_axis_, - true, - rank_); - auto src_value = src_data_[conj_idx]; - auto conj_value = C(src_value.real, -src_value.imag); - dst_data_[dst_idx] = conj_value; - } else { - const auto copy_idx = get_src_idx(dst_idx, - dst_strides_, - dst_shape_, - src_strides_, - is_fft_axis_, - false, - rank_); - dst_data_[dst_idx] = src_data_[copy_idx]; - } - } - - const C* src_data_; - C* dst_data_; - const int64_t* src_strides_; - const int64_t* dst_strides_; - const int64_t* dst_shape_; - const bool* is_fft_axis_; - const int64_t last_axis_; - const int64_t last_axis_size_; - const int64_t rank_; -}; - -template -void fill_conj(const DeviceContext& ctx, - const Tensor* src, - Tensor* dst, - const std::vector& axes) { - std::vector src_strides_v = - phi::vectorize(phi::stride(src->dims())); - std::vector dst_strides_v = - phi::vectorize(phi::stride(dst->dims())); - std::vector dst_shape_v = phi::vectorize(dst->dims()); - const auto src_data = src->data(); - auto dst_data = dst->data(); - const auto last_axis = axes.back(); - const auto last_axis_size = dst->dims().at(last_axis) / 2 + 1; - const int64_t rank = dst->dims().size(); - auto _is_fft_axis = std::make_unique(rank); - for (const auto i : axes) { - _is_fft_axis[i] = true; - } - -#if defined(__NVCC__) || defined(__HIPCC__) - const thrust::device_vector src_strides_g(src_strides_v); - const auto src_strides = thrust::raw_pointer_cast(src_strides_g.data()); - const thrust::device_vector dst_strides_g(dst_strides_v); - const auto dst_strides = thrust::raw_pointer_cast(dst_strides_g.data()); - const thrust::device_vector dst_shape_g(dst_shape_v); - const auto dst_shape = thrust::raw_pointer_cast(dst_shape_g.data()); - const thrust::device_vector is_fft_axis_g(_is_fft_axis.get(), - _is_fft_axis.get() + rank); - const auto p_is_fft_axis = thrust::raw_pointer_cast(is_fft_axis_g.data()); -#else - const auto src_strides = src_strides_v.data(); - const auto dst_strides = dst_strides_v.data(); - const auto dst_shape = dst_shape_v.data(); - const auto p_is_fft_axis = _is_fft_axis.get(); -#endif - platform::ForRange for_range(ctx, dst->numel()); - FFTFillConjFunctor fill_conj_functor(src_data, - dst_data, - src_strides, - dst_strides, - dst_shape, - p_is_fft_axis, - last_axis, - last_axis_size, - rank); - for_range(fill_conj_functor); -} - -template -class FFTC2CKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* x = ctx.Input("X"); - auto* 
y = ctx.Output("Out"); - - y->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTC2CFunctor fft_c2c_func; - fft_c2c_func(dev_ctx, x, y, axes, normalization, forward); - } -}; - -template -class FFTC2CGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - dx->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTC2CFunctor fft_c2c_func; - fft_c2c_func(dev_ctx, dy, dx, axes, normalization, !forward); - } -}; - -template -class FFTR2CKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const bool onesided = ctx.Attr("onesided"); - const auto* x = ctx.Input("X"); - auto* y = ctx.Output("Out"); - - y->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTR2CFunctor fft_r2c_func; - - if (onesided) { - fft_r2c_func(dev_ctx, x, y, axes, normalization, forward); - } else { - framework::DDim onesided_dims(y->dims()); - const int64_t onesided_last_axis_size = y->dims().at(axes.back()) / 2 + 1; - onesided_dims.at(axes.back()) = onesided_last_axis_size; - framework::Tensor onesided_out; - onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); - fft_r2c_func(dev_ctx, x, &onesided_out, axes, normalization, forward); - fill_conj(dev_ctx, &onesided_out, y, axes); - } - } -}; - -template -class FFTR2CGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - const auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const bool onesided = ctx.Attr("onesided"); - - const auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - framework::Tensor complex_dx; - complex_dx.mutable_data(dx->dims(), ctx.GetPlace()); - - auto normalization = get_norm_from_string(norm_str, forward); - FFTC2CFunctor fft_c2c_func; - - if (!onesided) { - fft_c2c_func(dev_ctx, dy, &complex_dx, axes, normalization, !forward); - } else { - framework::Tensor full_dy; - full_dy.mutable_data(dx->dims(), ctx.GetPlace()); - auto zero_length = static_cast(full_dy.dims().at(axes.back()) - - dy->dims().at(axes.back())); - auto rank = dy->dims().size(); - - std::vector pads(rank * 2, 0); - pads[axes.back() * 2 + 1] = zero_length; - - phi::funcs::PaddingFunctor( - rank, - ctx.template device_context(), - pads, - static_cast(0), - *dy, - &full_dy); - fft_c2c_func( - dev_ctx, &full_dy, &complex_dx, axes, normalization, !forward); - } - framework::TransComplexToReal( - framework::TransToProtoVarType(dx->dtype()), - framework::TransToProtoVarType(complex_dx.dtype()), - complex_dx, - dx); - } -}; - -template -class 
FFTC2RKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* x = ctx.Input("X"); - auto* y = ctx.Output("Out"); - - y->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTC2RFunctor fft_c2r_func; - fft_c2r_func(dev_ctx, x, y, axes, normalization, forward); - } -}; - -template -class FFTC2RGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using C = paddle::platform::complex; - auto& dev_ctx = ctx.device_context(); - - auto axes = ctx.Attr>("axes"); - const std::string& norm_str = ctx.Attr("normalization"); - const bool forward = ctx.Attr("forward"); - const auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - C* pdx = dx->mutable_data(ctx.GetPlace()); - auto normalization = get_norm_from_string(norm_str, forward); - - FFTR2CFunctor fft_r2c_func; - fft_r2c_func(dev_ctx, dy, dx, axes, normalization, !forward); - - const int64_t double_length = - dy->dims()[axes.back()] - dx->dims()[axes.back()]; - const framework::DDim strides = phi::stride(dx->dims()); - -#if defined(__NVCC__) || defined(__HIPCC__) - const thrust::device_vector strides_g(phi::vectorize(strides)); - const int64_t* pstrides = thrust::raw_pointer_cast(strides_g.data()); -#else - const int64_t* pstrides = strides.Get(); -#endif - - FFTFillConjGradFunctor func(pdx, axes.back(), pstrides, double_length); - size_t limit = dx->numel(); - platform::ForRange for_range(dev_ctx, limit); - for_range(func); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc index d708abe3d0f8c..986911a139145 100644 --- a/paddle/fluid/operators/stft_op.cc +++ b/paddle/fluid/operators/stft_op.cc @@ -14,8 +14,6 @@ #include "paddle/fluid/operators/stft_op.h" -#include "paddle/fluid/operators/spectral_helper.h" - namespace paddle { namespace operators { class StftOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/stft_op.cu b/paddle/fluid/operators/stft_op.cu index 9edee0f66c514..bd48112f0737e 100644 --- a/paddle/fluid/operators/stft_op.cu +++ b/paddle/fluid/operators/stft_op.cu @@ -13,7 +13,6 @@ // limitations under the License. 
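// Illustrative sketch, not part of the patch: the fill_conj helper removed
// above (its role is taken over by phi::funcs::FFTFillConj) expands a
// one-sided R2C spectrum back to a full spectrum using Hermitian symmetry:
// for a real signal of length n, X[n - k] == conj(X[k]). The 1-D sketch
// below shows only that idea with std::complex; the function name and the
// flat-vector interface are hypothetical, not Paddle APIs.
#include <complex>
#include <cstdint>
#include <vector>

std::vector<std::complex<float>> ExpandOnesided1D(
    const std::vector<std::complex<float>>& onesided, int64_t n) {
  // onesided holds frequency bins 0 .. n/2; the remaining bins are
  // conjugate mirrors of already known bins.
  std::vector<std::complex<float>> full(n);
  for (int64_t k = 0; k < static_cast<int64_t>(onesided.size()); ++k) {
    full[k] = onesided[k];
  }
  for (int64_t k = static_cast<int64_t>(onesided.size()); k < n; ++k) {
    full[k] = std::conj(full[n - k]);  // mirror of an already-filled bin
  }
  return full;
}
// The multi-dimensional functor above applies the same rule along the last
// FFT axis while walking the destination strides, which is why it only
// needs the strides, the dst shape, and the set of FFT axes.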
#include "paddle/fluid/operators/stft_op.h" -#include "paddle/fluid/operators/spectral_op.cu.h" namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h index bbd9b137699dc..fb2ca31608cd7 100644 --- a/paddle/fluid/operators/stft_op.h +++ b/paddle/fluid/operators/stft_op.h @@ -18,8 +18,11 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/spectral_op.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_fill_conj.h" #include "paddle/phi/kernels/funcs/frame_functor.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -76,25 +79,25 @@ class StftKernel : public framework::OpKernel { ctx, &frames, window, axes.back(), MulFunctor(), &frames_w); // FFTR2C - FFTNormMode normalization; + phi::funcs::FFTNormMode normalization; if (normalized) { - normalization = get_norm_from_string("ortho", true); + normalization = phi::funcs::get_norm_from_string("ortho", true); } else { - normalization = get_norm_from_string("backward", true); + normalization = phi::funcs::get_norm_from_string("backward", true); } - FFTR2CFunctor fft_r2c_func; + phi::funcs::FFTR2CFunctor fft_r2c_func; if (onesided) { - fft_r2c_func(dev_ctx, &frames_w, out, axes, normalization, true); + fft_r2c_func(dev_ctx, frames_w, out, axes, normalization, true); } else { framework::DDim onesided_dims(out->dims()); const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1; onesided_dims.at(axes.back()) = onesided_axis_size; Tensor onesided_out; onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); - fft_r2c_func( - dev_ctx, &frames_w, &onesided_out, axes, normalization, true); - fill_conj(dev_ctx, &onesided_out, out, axes); + fft_r2c_func(dev_ctx, frames_w, &onesided_out, axes, normalization, true); + phi::funcs::FFTFillConj( + dev_ctx, &onesided_out, out, axes); } } }; @@ -131,17 +134,17 @@ class StftGradKernel : public framework::OpKernel { complex_d_frames_w.mutable_data(d_frames_dims, ctx.GetPlace()); // dy -> d_frames_w - FFTNormMode normalization; + phi::funcs::FFTNormMode normalization; if (normalized) { - normalization = get_norm_from_string("ortho", true); + normalization = phi::funcs::get_norm_from_string("ortho", true); } else { - normalization = get_norm_from_string("backward", true); + normalization = phi::funcs::get_norm_from_string("backward", true); } - FFTC2CFunctor fft_c2c_func; + phi::funcs::FFTC2CFunctor fft_c2c_func; if (!onesided) { fft_c2c_func( - dev_ctx, dy, &complex_d_frames_w, axes, normalization, false); + dev_ctx, *dy, &complex_d_frames_w, axes, normalization, false); } else { Tensor full_dy; full_dy.mutable_data(d_frames_dims, ctx.GetPlace()); @@ -153,20 +156,11 @@ class StftGradKernel : public framework::OpKernel { pads[axes.back() * 2 + 1] = zero_length; phi::funcs::PaddingFunctor( - rank, - ctx.template device_context(), - pads, - static_cast(0), - *dy, - &full_dy); + rank, dev_ctx, pads, static_cast(0), *dy, &full_dy); fft_c2c_func( - dev_ctx, &full_dy, &complex_d_frames_w, axes, normalization, false); + dev_ctx, full_dy, &complex_d_frames_w, axes, normalization, false); } - framework::TransComplexToReal( - framework::TransToProtoVarType(d_frames_w.dtype()), - framework::TransToProtoVarType(complex_d_frames_w.dtype()), - complex_d_frames_w, - &d_frames_w); + 
phi::RealKernel(dev_ctx, complex_d_frames_w, &d_frames_w); // d_frames_w -> d_frames Tensor d_frames; diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc index 2dab74838a0b4..1e98035039f85 100644 --- a/paddle/fluid/operators/tdm_child_op.cc +++ b/paddle/fluid/operators/tdm_child_op.cc @@ -34,7 +34,7 @@ class TDMChildOpMaker : public framework::OpProtoAndCheckerMaker { "information in the following format: item_id(shape=1), " "layer_id(shape=1), parent_id(shape=1), child_id(shape=child_nums)"); AddAttr("child_nums", - "child_nums(int)", + "child_nums(int)" "The child nums of one node, if the node hasn't enough child, " "it should padding 0 until child nums equal to child_nums"); AddOutput("Child", diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index 5ef047cd06914..b229c4aed79b2 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -34,7 +34,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { "engine_key", "The engine_key here is used to distinguish different TRT Engines"); AddAttr("max_batch_size", "the maximum batch size."); - AddAttr("workspace_size", "the workspace size."); + AddAttr("workspace_size", "the workspace size.").AsExtra(); AddAttr("sub_block", "the trt block"); AddAttr("enable_int8", "whether swith to int8 mode"); AddComment("TensorRT engine operator."); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index b13996b6fab78..9b05faf8df47a 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -177,7 +177,7 @@ class TensorRTEngineOp : public framework::OperatorBase { std::vector runtime_input_names_; mutable TensorRTEngine *trt_engine_{nullptr}; int max_batch_size_; - int workspace_size_; + int64_t workspace_size_; std::unique_ptr calibrator_; bool enable_int8_; bool enable_fp16_; @@ -207,7 +207,7 @@ class TensorRTEngineOp : public framework::OperatorBase { : framework::OperatorBase(type, inputs, outputs, attrs) { input_names_ = Inputs("Xs"); max_batch_size_ = Attr("max_batch_size"); - workspace_size_ = Attr("workspace_size"); + workspace_size_ = Attr("workspace_size"); device_id_ = Attr("gpu_id"); enable_int8_ = Attr("enable_int8"); enable_fp16_ = Attr("enable_fp16"); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 33ebaff8eabad..7b58a1bb7d6d2 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -107,7 +107,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { engine_op_desc.SetBlockAttr("sub_block", &block_desc); engine_op_desc.SetAttr("max_batch_size", static_cast(2)); - engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); engine_op_desc.SetAttr("parameters", std::vector({})); engine_op_desc.SetAttr("engine_key", std::string("a_engine")); engine_op_desc.SetAttr("calibration_engine_key", @@ -259,7 +259,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetBlockAttr("sub_block", &block_desc); engine_op_desc.SetAttr("max_batch_size", static_cast(batch_size)); - engine_op_desc.SetAttr("workspace_size", static_cast(1 << 
20)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); engine_op_desc.SetAttr("parameters", std::vector({"y0", "y1", "y2", "y3"})); engine_op_desc.SetAttr("engine_key", std::string("b_engine")); diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index 1215bf2de3c67..1d5b57a8a3d53 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -114,42 +114,6 @@ class TileGradOp : public framework::OperatorWithKernel { "TileGrad"); auto x_dims = ctx->GetInputDim("X"); - - std::vector repeat_times = - ctx->Attrs().Get>("repeat_times"); - if (repeat_times.size() == 0) { - repeat_times = std::vector(x_dims.size(), -1); - } - - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto x_dim_vec = phi::vectorize(x_dims); - if (x_dim_vec.size() > repeat_times.size()) { - auto diff = x_dim_vec.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, -1); - } else { - auto diff = repeat_times.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); - } - - for (size_t i = 0; i < repeat_times.size(); ++i) { - if (repeat_times[i] == -1 || x_dim_vec[i] == -1) { - continue; - } else { - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - x_dim_vec[i] * repeat_times[i], - out_dims[i], - platform::errors::InvalidArgument( - "The size (%d) of the dimension %d of Input(Out@GRAD) should " - "be equal to the multiplication of the crroresponding " - "dimension size of Input(X) (%d) and repeat_times (%d).", - out_dims[i], - i, - x_dim_vec[i], - repeat_times[i])); - } - } - } auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 4722a1fc9cc8a..610d6e1f48aad 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -80,7 +80,7 @@ class TransposeOp : public framework::OperatorWithKernel { #ifdef PADDLE_WITH_MKLDNN // Here we need to match dims to paddle layout // as we are producing non-oneDNN result - if ((x_dims.size() >= 3) && + if (ctx->IsRunMKLDNNKernel() && (x_dims.size() >= 3) && (paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC)) { auto dims = phi::vectorize(x_dims); diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 47679ca57f5bf..b1a0d68c12e7b 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
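// Illustrative sketch, not part of the patch: the TensorRT engine op above
// now stores "workspace_size" as int64_t (and the test sets it via
// static_cast<int64_t>) because workspaces of a few GiB overflow a 32-bit
// int; the Python-side default later in this patch also grows to 1 << 30.
// A minimal, self-contained check of the overflow, assuming nothing beyond
// the standard library:
#include <cstdint>
#include <iostream>

int main() {
  int64_t four_gib = int64_t{1} << 32;        // a 4 GiB workspace request
  int as_int32 = static_cast<int>(four_gib);  // typically keeps only the low
                                              // 32 bits, i.e. becomes 0
  std::cout << four_gib << " vs " << as_int32 << "\n";
  return 0;
}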
*/ -#include "paddle/fluid/operators/unpool_op.h" - #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -152,45 +156,6 @@ class UnpoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Unpool"); - OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Unpool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Unpool"); - auto in_x_dims = ctx->GetInputDim("X"); - auto in_y_dims = ctx->GetInputDim("Indices"); - std::string unpooling_type = - ctx->Attrs().Get("unpooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - std::vector output_size = - ctx->Attrs().Get>("output_size"); - PADDLE_ENFORCE_EQ(in_x_dims.size() == 4, - true, - platform::errors::InvalidArgument( - "Unpool Intput(X) must be of 4-dimensional, but " - "received Input(X)'s dimensions is %d.", - in_x_dims.size())); - PADDLE_ENFORCE_EQ(in_x_dims, - in_y_dims, - platform::errors::InvalidArgument( - "The dimensions of Input(X) must equal to be" - "the dimensions of Input(Indices), but received" - "dimensions of Input(X) is [%d], received dimensions" - "of Input(Indices) is [%d]", - in_x_dims, - in_y_dims)); - - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - if (!ctx->IsRuntime() && in_x_dims[i + 2] <= 0) { - output_shape.push_back(-1); - } else { - output_shape.push_back(output_size[i]); - } - } - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - } }; class Unpool3dOp : public framework::OperatorWithKernel { @@ -204,45 +169,6 @@ class Unpool3dOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Unpool3d"); - OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Unpool3d"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Unpool3d"); - auto in_x_dims = ctx->GetInputDim("X"); - auto in_y_dims = ctx->GetInputDim("Indices"); - std::string unpooling_type = - ctx->Attrs().Get("unpooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - std::vector output_size = - ctx->Attrs().Get>("output_size"); - PADDLE_ENFORCE_EQ(in_x_dims.size() == 5, - true, - platform::errors::InvalidArgument( - "Unpool Intput(X) must be of 5-dimensional, but " - "received Input(X)'s dimensions is %d.", - in_x_dims.size())); - PADDLE_ENFORCE_EQ(in_x_dims, - in_y_dims, - platform::errors::InvalidArgument( - "The dimensions of Input(X) must equal to be" - "the dimensions of Input(Indices), but received" - "dimensions of Input(X) is [%d], received dimensions" - "of Input(Indices) is [%d]", - in_x_dims, - in_y_dims)); - - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - if (!ctx->IsRuntime() && in_x_dims[i + 2] <= 0) { - output_shape.push_back(-1); - } else { - output_shape.push_back(output_size[i]); - } - } - 
ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - } }; template @@ -286,14 +212,6 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "UnpoolGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "UnpoolGrad"); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } }; class Unpool3dOpGrad : public framework::OperatorWithKernel { @@ -307,44 +225,43 @@ class Unpool3dOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Unpool3dGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "Unpool3dGrad"); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(unpool, + UnpoolInferShapeFunctor, + PD_INFER_META(phi::UnpoolInferMeta)); REGISTER_OPERATOR(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, ops::UnpoolOpGradMaker, - ops::UnpoolOpGradMaker); + ops::UnpoolOpGradMaker, + UnpoolInferShapeFunctor); -REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_CPU_KERNEL(unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(unpool_grad, + UnpoolGradInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + +REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad, UnpoolGradInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(unpool, + Unpool3dInferShapeFunctor, + PD_INFER_META(phi::Unpool3dInferMeta)); REGISTER_OPERATOR(unpool3d, ops::Unpool3dOp, ops::Unpool3dOpMaker, ops::Unpool3dOpGradMaker, - ops::Unpool3dOpGradMaker); + ops::Unpool3dOpGradMaker, + Unpool3dInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(unpool3d_grad, + Unpool3dGradInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); -REGISTER_OPERATOR(unpool3d_grad, ops::Unpool3dOpGrad); -REGISTER_OP_CPU_KERNEL(unpool3d, - ops::Unpool3dKernel, - ops::Unpool3dKernel); -REGISTER_OP_CPU_KERNEL(unpool3d_grad, - ops::Unpool3dGradKernel, - ops::Unpool3dGradKernel); +REGISTER_OPERATOR(unpool3d_grad, + ops::Unpool3dOpGrad, + Unpool3dGradInferShapeFunctor); diff --git a/paddle/fluid/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc deleted file mode 100644 index 82890419dafa5..0000000000000 --- a/paddle/fluid/operators/unpool_op.cu.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/unpool_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_CUDA_KERNEL(unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); -REGISTER_OP_CUDA_KERNEL(unpool3d, - ops::Unpool3dKernel, - ops::Unpool3dKernel); -REGISTER_OP_CUDA_KERNEL(unpool3d_grad, - ops::Unpool3dGradKernel, - ops::Unpool3dGradKernel); diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h deleted file mode 100644 index 062008f95ea3c..0000000000000 --- a/paddle/fluid/operators/unpool_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/unpooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { -template -class UnpoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* in_y = context.Input("Indices"); - auto* out = context.Output("Out"); - std::string unpooling_type = context.Attr("unpooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - T* output_data = out->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - if (output_data) { - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, out, static_cast(0)); - } - math::Unpool2dMaxFunctor unpool2d_max_forward; - unpool2d_max_forward(dev_ctx, *in_x, *in_y, out); - } -}; -template -class UnpoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* in_y = context.Input("Indices"); - const framework::Tensor* out = context.Input("Out"); - const framework::Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - framework::Tensor* in_x_grad = - context.Output(framework::GradVarName("X")); - std::string unpooling_type = context.Attr("unpooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0)); - - math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); - } -}; - -template -class Unpool3dKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = 
context.Input("X"); - const framework::Tensor* in_y = context.Input("Indices"); - auto* out = context.Output("Out"); - std::string unpooling_type = context.Attr("unpooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - T* output_data = out->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - if (output_data) { - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, out, static_cast(0)); - } - math::Unpool3dMaxFunctor unpool3d_max_forward; - unpool3d_max_forward(dev_ctx, *in_x, *in_y, out); - } -}; - -template -class Unpool3dGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* in_y = context.Input("Indices"); - const framework::Tensor* out = context.Input("Out"); - const framework::Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - framework::Tensor* in_x_grad = - context.Output(framework::GradVarName("X")); - std::string unpooling_type = context.Attr("unpooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0)); - - math::Unpool3dMaxGradFunctor unpool3d_max_backward; - unpool3d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 75331dbbb2602..6ed27fd9b326d 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -265,15 +265,6 @@ cc_library( set(DEVICE_EVENT_LIBS device_event_base CACHE INTERNAL "device event libs") -if(WITH_XPU) - cc_library( - device_event_xpu - SRCS device_event_xpu.cc - DEPS device_event_base xpu_info) - set(DEVICE_EVENT_LIBS - device_event_xpu - CACHE INTERNAL "device event libs") -endif() if(WITH_ASCEND_CL) cc_library( diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h index 77d14aa712e70..4d88d7a96246d 100644 --- a/paddle/fluid/platform/device/xpu/enforce_xpu.h +++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h @@ -25,10 +25,6 @@ inline const char* xpuGetErrorString(int stat) { return phi::backends::xpu::xpuGetErrorString(stat); } -inline const char* bkclGetErrorString(BKCLResult_t stat) { - return phi::backends::xpu::bkclGetErrorString(stat); -} - inline const char* xdnnGetErrorString(int stat) { return phi::backends::xpu::xdnnGetErrorString(stat); } @@ -37,10 +33,16 @@ inline std::string build_xpu_error_msg(int stat) { return phi::backends::xpu::build_xpu_error_msg(stat); } +#ifdef PADDLE_WITH_XPU_BKCL inline std::string build_xpu_error_msg(BKCLResult_t stat) { return phi::backends::xpu::build_xpu_error_msg(stat); } +inline const char* bkclGetErrorString(BKCLResult_t stat) { + return phi::backends::xpu::bkclGetErrorString(stat); +} +#endif + inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) { return phi::backends::xpu::build_xpu_xdnn_error_msg(stat, msg); } diff --git a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc 
b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc index 0b528c3999e07..b2fd80530feb7 100644 --- a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc +++ b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc @@ -124,6 +124,7 @@ TEST(enforce, xpu_status) { "Execution interrupted by user")); } +#ifdef PADDLE_WITH_XPU_BKCL TEST(enforce, bkcl_status) { EXPECT_TRUE(CheckXPUStatusSuccess(BKCL_SUCCESS)); EXPECT_TRUE( @@ -133,6 +134,7 @@ TEST(enforce, bkcl_status) { EXPECT_TRUE( CheckXPUStatusFailure(BKCL_INTERNAL_ERROR, "BKCL_INTERNAL_ERROR")); } +#endif TEST(enforce, xdnn_status) { EXPECT_TRUE(CheckXDNNStatusSuccess(xpu::Error_t::SUCCESS)); diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index c36dd6425c899..5aa5ec260db1f 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -151,6 +151,16 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, + {"empty", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace())})}, {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -312,6 +322,9 @@ XPUOpMap& get_kl2_ops() { {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"merged_momentum", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), @@ -465,6 +478,22 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), @@ -531,6 +560,22 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"unsqueeze_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, 
XPUPlace())})}, + {"unsqueeze", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"where_index", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 4bb1e3abf8a58..4b8833f9a6cd6 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -113,8 +113,8 @@ bool AllowTF32Cudnn(); enum DeviceType { CPU = 0, CUDA = 1, - XPU = 2, - NPU = 3, + NPU = 2, + XPU = 3, IPU = 4, MLU = 5, diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 2edccfa90c939..cf80266050af2 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -37,12 +37,6 @@ USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) #endif -#ifdef PADDLE_WITH_XPU -USE_EVENT(kXPU); -USE_EVENT_WAIT(kXPU, kXPU) -USE_EVENT_WAIT(kCPU, kXPU) -#endif - #ifdef PADDLE_WITH_ASCEND_CL USE_EVENT(kNPU); USE_EVENT_WAIT(kNPU, kNPU) diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h index a2d3fc1dc3818..6a2948480b549 100644 --- a/paddle/fluid/platform/device_event_base.h +++ b/paddle/fluid/platform/device_event_base.h @@ -64,9 +64,9 @@ class DeviceEvent { "Required type < %d, but received type = %d", MaxDeviceTypes, type_id_)); - // TODO(Aurelius84): only support CPU/CUDA/XPU/NPU. + // TODO(Aurelius84): only support CPU/CUDA/NPU. PADDLE_ENFORCE_LT(type_id_, - 4, + 3, platform::errors::Unavailable( "Currently DeviceEvent do not support %s", place)); PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/platform/device_event_xpu.cc b/paddle/fluid/platform/device_event_xpu.cc deleted file mode 100644 index 098b0a56459a6..0000000000000 --- a/paddle/fluid/platform/device_event_xpu.cc +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/platform/device/xpu/xpu_info.h" -#include "paddle/fluid/platform/device_event_base.h" - -#ifdef PADDLE_WITH_XPU -namespace paddle { -namespace platform { - -struct XPUDeviceEventWrapper { - explicit XPUDeviceEventWrapper(const platform::Place& place) { - PADDLE_ENFORCE_EQ( - platform::is_xpu_place(place), - true, - platform::errors::PreconditionNotMet( - "Required device shall be XPUPlace, but received %d. ", place)); - - device_id_ = place.device; - PADDLE_ENFORCE_GT( - device_id_, - -1, - platform::errors::PreconditionNotMet( - "Required DeviceOption.device_id > -1, but received %d. 
", - device_id_)); - xpu_event_create(&handle_); - } - - xpuEventHandle handle_; - int device_id_; -}; - -void DeviceEventCreateXPU(DeviceEvent* event, - const platform::Place& place, - unsigned int) { - event->InitEvent(std::make_shared(place)); -} - -void DeviceEventRecordXPU(DeviceEvent* event, const DeviceContext* context) { - auto* wrapper = static_cast(event->GetEvent().get()); - PADDLE_ENFORCE_NOT_NULL( - wrapper, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast event into XPUDeviceEventWrapper.")); - - auto* xpu_dev_ctx = dynamic_cast(context); - PADDLE_ENFORCE_NOT_NULL( - xpu_dev_ctx, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast context into XPUDeviceContext.")); - xpu_event_record(wrapper->handle_, xpu_dev_ctx->stream()); -} - -void DeviceEventFinishXPU(const DeviceEvent* event) { - auto* wrapper = static_cast(event->GetEvent().get()); - PADDLE_ENFORCE_NOT_NULL( - wrapper, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast event into XPUDeviceEventWrapper.")); - xpu_event_wait(wrapper->handle_); -} - -// current xpu not support query, used wait to instead. -bool DeviceEventQueryXPU(const DeviceEvent* event) { - DeviceEventFinishXPU(event); - return true; -} - -void DeviceEventXPUWaitXPU(const DeviceEvent* event, - const DeviceContext* context) { - auto* wrapper = static_cast(event->GetEvent().get()); - PADDLE_ENFORCE_NOT_NULL( - wrapper, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast event into XPUDeviceEventWrapper.")); - auto* xpu_dev_ctx = dynamic_cast(context); - PADDLE_ENFORCE_NOT_NULL( - xpu_dev_ctx, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast context into XOUDeviceContext.")); - xpu_stream_wait_event(xpu_dev_ctx->stream(), wrapper->handle_); -} - -void DeviceEventCPUWaitXPU(const DeviceEvent* event, - const DeviceContext* context) { - DeviceEventFinishXPU(event); -} - -void DeviceEventSetFinishedXPU(const DeviceEvent* event) { - // do nothing -} - -void EventResetXPU(const DeviceEvent* event) { - // do nothing -} - -} // namespace platform -} // namespace paddle - -using ::paddle::platform::kCPU; -using ::paddle::platform::kXPU; -REGISTER_EVENT_CREATE_FUNCTION(kXPU, paddle::platform::DeviceEventCreateXPU) -REGISTER_EVENT_RECORD_FUNCTION(kXPU, paddle::platform::DeviceEventRecordXPU) -REGISTER_EVENT_QUERY_FUNCTION(kXPU, paddle::platform::DeviceEventQueryXPU) -REGISTER_EVENT_FINISH_FUNCTION(kXPU, paddle::platform::DeviceEventFinishXPU) -REGISTER_EVENT_SET_FINISHED_FUNCTION( - kXPU, paddle::platform::DeviceEventSetFinishedXPU) -REGISTER_EVENT_WAIT_FUNCTION(kXPU, - kXPU, - paddle::platform::DeviceEventXPUWaitXPU) -REGISTER_EVENT_WAIT_FUNCTION(kCPU, - kXPU, - paddle::platform::DeviceEventCPUWaitXPU) -REGISTER_EVENT_RESET_FUNCTION(kXPU, paddle::platform::EventResetXPU) -#endif diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index f6c7bb0435365..f1d7bad90f232 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -141,6 +141,7 @@ class CudaEvent { #else cudaEventCreateWithFlags(&event_, flags_); #endif + VLOG(4) << "CudaEvent " << event_; } explicit CudaEvent(unsigned int flags) : flags_(flags) { @@ -149,6 +150,7 @@ class CudaEvent { #else cudaEventCreateWithFlags(&event_, flags_); #endif + VLOG(4) << "CudaEvent " << event_; } ~CudaEvent() { diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 7a30b85181131..b16489e940d13 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ 
-1006,8 +1006,8 @@ PADDLE_DEFINE_EXPORTED_bool( * default=PE * Example: * Note: - * FLAGS_jit_engine_type == Executor, using ExecutorFunction by default - * FLAGS_jit_engine_type == PE, using PEFunction by default + * FLAGS_jit_engine_type == Executor, using ExecutorEngine by default + * FLAGS_jit_engine_type == PE, using PEEngine by default */ PADDLE_DEFINE_EXPORTED_string(jit_engine_type, "PE", diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 85b0775c751dc..a88dc2a24863b 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h" namespace paddle { namespace platform { @@ -38,216 +39,8 @@ template -class MKLDNNHandlerNoCachingT { - public: - MKLDNNHandlerNoCachingT(dnnl::engine engine, platform::Place cpu_place) - : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { - platform::MKLDNNDeviceContext::tls().log_lib_version(); - } - - std::shared_ptr AcquireForwardPrimitive() { - return std::make_shared(*fwd_pd_); - } - - std::shared_ptr AcquireBackwardPrimitive() { - return std::make_shared(*bwd_pd_); - } - - std::shared_ptr AcquireBackwardWeightsPrimitive() { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable("BWD_PD should be set when " - "getting BWD prim .")); - return std::make_shared(*bwd_w_pd_); - } - - std::shared_ptr AcquireSrcMemory( - const framework::Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), - to_void_cast(input_data)); - } - - template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T_out* ptr = - output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); - } - - template - std::shared_ptr AcquireDstMemory(void) { - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); - } - - template - std::shared_ptr AcquireDstMemory( - const framework::Tensor* output) { - const T_out* output_data = output->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), - to_void_cast(output_data)); - } - - std::shared_ptr AcquireDiffDstMemory( - const framework::Tensor* diffdst) { - const T* ptr = diffdst->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), - to_void_cast(ptr)); - } - - std::shared_ptr AcquireDiffSrcMemory( - framework::Tensor* diffsrc) { - T* ptr = - diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); - } - - // Buffer of given Tensor is used for oneDNN computation - std::shared_ptr AcquireDiffWeightsMemory( - framework::Tensor* diff_weights) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); - T* ptr = diff_weights->mutable_data( - place_, bwd_w_pd_->diff_weights_desc().get_size()); - return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), - ptr); - } - - // Buffer is allocated by oneDNN to store computation results - std::shared_ptr AcquireDiffWeightsMemory(void) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); - return 
this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); - } - - protected: - // If your primitive descriptor requires attributes, pass them as a - // first argument and paramters to descriptor constructor in the following - // arguments. Otherwise, all arguments will be forwarded to descriptor - // constructor, including the first one. - template - void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { - CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); - } - - // Using sfinae to specialise variadic function. Workaround for not having - // if constexpr in C++ 11. - template - typename std::enable_if::type, - dnnl::primitive_attr>::value>::type - CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { - auto fwd_desc = typename TForward::desc(std::forward(args)...); - fwd_pd_ = std::make_shared( - fwd_desc, first, engine_); - } - - template - typename std::enable_if::type, - dnnl::primitive_attr>::value>::type - CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { - auto fwd_desc = typename TForward::desc(std::forward(first), - std::forward(args)...); - fwd_pd_ = - std::make_shared(fwd_desc, engine_); - } - - template - void AcquireBackwardPrimitiveDescriptor(Args&&... args) { - // fwd_pd_ is set during grad by calling - // AcquireForwardPrimitiveDescriptor - PADDLE_ENFORCE_NOT_NULL(fwd_pd_, - platform::errors::Unavailable( - "Get MKLDNN Forward primitive %s failed.")); - auto bwd_desc = typename TBackward::desc(std::forward(args)...); - bwd_pd_ = std::make_shared( - bwd_desc, engine_, *fwd_pd_); - } - - template - void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { - // fwd_pd_ is set during grad by calling - // AcquireForwardPrimitiveDescriptor - PADDLE_ENFORCE_NOT_NULL(fwd_pd_, - platform::errors::Unavailable( - "Get MKLDNN Forward primitive %s failed.")); - auto bwd_desc = - typename TBackward_params::desc(std::forward(args)...); - bwd_w_pd_ = std::make_shared( - bwd_desc, engine_, *fwd_pd_); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - dnnl::memory::desc md, void* ptr) { - return std::make_shared(md, engine_, ptr); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - dnnl::memory::desc md) { - return std::make_shared(md, engine_); - } - - void AcquireReorder(const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p) { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - platform::RecordEvent record_reorder("int_reorder", - platform::TracerEventType::UserDefined, - 2, - platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } - - template - std::shared_ptr AcquireMemoryWithReorder( - const dnnl::memory::desc& user_md, - const dnnl::memory::desc& target_md, - void* ptr, - bool is_persistent = false, - std::function(const F*)> custom_reorder_func = {}) { - std::shared_ptr target_memory_p; - if (custom_reorder_func) { - auto reordered_data = - custom_reorder_func(reinterpret_cast(ptr)); - ptr = reinterpret_cast(reordered_data.get()); - } - auto user_memory_p = std::make_shared(user_md, engine_, ptr); - if (user_md != target_md) { - target_memory_p = std::make_shared(target_md, engine_); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder( - "int_reorder", 
- platform::TracerEventType::UserDefined, - 2, - platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } else { - target_memory_p = user_memory_p; - } - return target_memory_p; - } - - dnnl::engine engine_; - platform::Place place_; - std::shared_ptr fwd_pd_; - std::shared_ptr bwd_pd_; - std::shared_ptr bwd_w_pd_; -}; +using MKLDNNHandlerNoCachingT = phi::funcs:: + MKLDNNHandlerNoCachingT; template +constexpr bool IsInt8() { + return std::is_same::value || std::is_same::value; +} + +template +constexpr bool IsBfloat16() { + return std::is_same::value; +} + +template class MatMulV2MKLDNNHandler - : public paddle::platform::MKLDNNHandlerNoCachingT { + : public paddle::platform::MKLDNNHandlerNoCachingT { public: MatMulV2MKLDNNHandler(const framework::ExecutionContext& ctx, const dnnl::engine engine, @@ -873,8 +676,8 @@ class MatMulV2MKLDNNHandler bool is_output_fused, const std::vector& x_strides_override, const std::vector& y_strides_override) - : paddle::platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { + : paddle::platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { // M X K * K X N std::vector x_dims(x_org_dims); std::vector y_dims(y_org_dims); @@ -934,28 +737,52 @@ class MatMulV2MKLDNNHandler out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; } - if (is_output_fused) { + if (!IsInt8() && !IsBfloat16() && is_output_fused) { out_strides = FakeTransposeStrides(out_ddims); } - auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); const dnnl::primitive_attr matmul_attrs = CreateMatmulAttrs(ctx); this->AcquireForwardPrimitiveDescriptor(matmul_attrs, x_md, y_md, out_md); } - // TODO(jczaja) : Adapt to int8 + float ComputeOutputScale(const framework::ExecutionContext& ctx) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; + if (ctx.HasAttr("Scale_x") && ctx.HasAttr("Scale_y") && + ctx.HasAttr("Scale_out")) { + float scale_x = ctx.Attr("Scale_x"); + float scale_y = ctx.Attr("Scale_y"); + bool force_fp32_out = ctx.HasAttr("force_fp32_output") + ? ctx.Attr("force_fp32_output") + : false; + float scale_out = force_fp32_out ? 1.f : ctx.Attr("Scale_out"); + alpha *= scale_out / (scale_x * scale_y); + } + return alpha; + } + dnnl::primitive_attr CreateMatmulAttrs( const framework::ExecutionContext& ctx) { dnnl::primitive_attr matmul_attrs; dnnl::post_ops post_operations; - float alpha = ctx.HasAttr("alpha") ? 
ctx.Attr("alpha") : 1.0f; - if (alpha != 1.0f) { - matmul_attrs.set_output_scales(0, {alpha}); + float scale_out = ComputeOutputScale(ctx); + if (scale_out != 1.0f) { + matmul_attrs.set_output_scales(0, {scale_out}); + } + + if (ctx.HasInput("ResidualData")) { + auto* residual_data = ctx.Input("ResidualData"); + auto residual_data_tz = phi::vectorize(residual_data->dims()); + auto residual_data_md = memory::desc(residual_data_tz, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abcd); + post_operations.append_binary(dnnl::algorithm::binary_add, + residual_data_md); } AppendActivation(ctx, post_operations); @@ -983,9 +810,23 @@ class MatMulV2MKLDNNHandler } std::shared_ptr AcquireWeightsMemory(const Tensor* input) { - const T* input_data = input->data(); + const YT* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), - to_void_cast(input_data)); + to_void_cast(input_data)); + } + + std::shared_ptr AcquireDstMemory( + paddle::framework::Tensor* output) { + // We cannot use base AcquireDstMemory as it makes an allocation request + // base on DST memory primitive size. This is fine in general, but in MatMul + // we have primitive that covers only one batch of Data and then shift + // pointer for every new batch. Hence Tensor size is bigger that dst memory + // primitive size. So would we request less memory that is there and it + // triggers an + // assertion. So as there is no 'any' format here we can leave default size + // of Tensor as computed in ComputeInferShape + OT* ptr = output->mutable_data(this->place_); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } }; @@ -1089,11 +930,11 @@ class ActivationMKLDNNHandler static std::unordered_map GetAttributeMap( std::string act_type) { std::unordered_map attr_map; - if (act_type == "swish") + if (act_type == "swish") { attr_map.emplace("beta", "fuse_alpha"); - else if (act_type == "relu6") + } else if (act_type == "relu6") { attr_map.emplace("threshold", "fuse_alpha"); - else if (act_type == "hard_sigmoid") { + } else if (act_type == "hard_sigmoid") { attr_map.emplace("slope", "fuse_alpha"); attr_map.emplace("offset", "fuse_beta"); } else if (act_type == "clip") { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 61bdbdafa812c..9596551136c20 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -333,8 +333,7 @@ static std::vector CastAttrsToTragetType( src.size())); for (size_t i = 0; i < src.size(); i++) { size_t end = attrs_names[i].find(": "); - std::string type_name = - attrs_names[i].substr(end + 2, attrs_names.size() - end - 2); + std::string type_name = attrs_names[i].substr(end + 2); if (type_name == "int") { if (src[i].type() == typeid(bool)) { res.emplace_back(static_cast(paddle::any_cast(src[i]))); @@ -373,8 +372,9 @@ static PyObject* eager_api_jit_function_call(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - std::shared_ptr function = - CastPyArg2BaseFunction(PyTuple_GET_ITEM(args, 0), 0); + + std::shared_ptr function = + CastPyArg2JitFunction(PyTuple_GET_ITEM(args, 0), 0); std::vector ins = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); std::vector outs = (*function)(ins); @@ -870,10 +870,13 @@ static PyObject* eager_api_to_uva_tensor(PyObject* self, PyObject* obj = PyTuple_GET_ITEM(args, 0); auto array = py::cast(py::handle(obj)); - int device_id = 0; - PyObject* Py_device_id = PyTuple_GET_ITEM(args, 1); - if (Py_device_id) { - 
device_id = CastPyArg2AttrLong(Py_device_id, 1); + Py_ssize_t args_num = PyTuple_Size(args); + int64_t device_id = 0; + if (args_num > 1) { + PyObject* Py_device_id = PyTuple_GET_ITEM(args, 1); + if (Py_device_id) { + device_id = CastPyArg2AttrLong(Py_device_id, 1); + } } if (py::isinstance>(array)) { diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 253291256ef86..6c1dea40b7814 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -22,8 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope_guard.h" -#include "paddle/fluid/jit/executor_function.h" -#include "paddle/fluid/jit/pe_function.h" +#include "paddle/fluid/jit/function.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/utils.h" @@ -54,8 +53,7 @@ extern PyTypeObject* g_customplace_pytype; extern PyTypeObject* g_framework_tensor_pytype; extern PyTypeObject* g_framework_lodtensorarray_pytype; extern PyTypeObject* g_custom_op_kernel_ctx_pytype; -extern PyTypeObject* g_executor_function_pytype; -extern PyTypeObject* g_pe_function_pytype; +extern PyTypeObject* g_jit_function_pytype; int TensorDtype2NumpyDtype(phi::DataType dtype) { switch (dtype) { @@ -232,19 +230,15 @@ std::shared_ptr CastPyArg2VarBase(PyObject* obj, return py::cast>(obj); } -std::shared_ptr CastPyArg2BaseFunction(PyObject* obj, - ssize_t arg_pos) { - if (PyObject_IsInstance( - obj, reinterpret_cast(g_executor_function_pytype))) { - return ::pybind11::handle(obj) - .cast>(); - } else if (PyObject_IsInstance( - obj, reinterpret_cast(g_pe_function_pytype))) { - return ::pybind11::handle(obj).cast>(); +std::shared_ptr CastPyArg2JitFunction(PyObject* obj, + ssize_t arg_pos) { + if (PyObject_IsInstance(obj, + reinterpret_cast(g_jit_function_pytype))) { + return ::pybind11::handle(obj).cast>(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "BaseFunction, but got %s", + "BaseEngine, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } @@ -1241,6 +1235,9 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, } else if (PyObject_CheckLongOrToLong(&obj)) { int value = CastPyArg2Int(obj, op_type, arg_pos); return paddle::experimental::Scalar(value); + } else if (PyObject_CheckString(obj)) { + std::string value = CastPyArg2String(obj, op_type, arg_pos); + return paddle::experimental::Scalar(value); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 4fab8534b7b76..df959b9abf4f1 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -20,7 +20,7 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/jit/base_function.h" +#include "paddle/fluid/jit/function.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -75,8 +75,8 @@ framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, std::unordered_map CastPyArg2Vocab(PyObject* obj, ssize_t arg_pos); std::vector CastPyArg2Strings(PyObject* obj, ssize_t arg_pos); -std::shared_ptr CastPyArg2BaseFunction(PyObject* obj, 
- ssize_t arg_pos); +std::shared_ptr CastPyArg2JitFunction(PyObject* obj, + ssize_t arg_pos); PyObject* ToPyObject(int value); PyObject* ToPyObject(uint32_t value); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 26332084ac9f1..34bfd385d4c7b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -670,7 +670,7 @@ void BindImperative(py::module *m_ptr) { .def("__init__", [](imperative::VarBase &self, framework::proto::VarType::Type dtype, - const std::vector &dims, + const std::vector &dims, const py::handle &name, framework::proto::VarType::Type type, bool persistable) { diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 3d2595860353e..14975ac337aed 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -687,7 +687,7 @@ void BindAnalysisConfig(py::module *m) { .def("specify_input_name", &AnalysisConfig::specify_input_name) .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, - py::arg("workspace_size") = 1 << 20, + py::arg("workspace_size") = 1 << 30, py::arg("max_batch_size") = 1, py::arg("min_subgraph_size") = 3, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, diff --git a/paddle/fluid/pybind/jit.cc b/paddle/fluid/pybind/jit.cc index 79576e6547f9a..a9c844093d1a5 100644 --- a/paddle/fluid/pybind/jit.cc +++ b/paddle/fluid/pybind/jit.cc @@ -18,10 +18,9 @@ limitations under the License. */ #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/jit/executor_function.h" +#include "paddle/fluid/jit/function.h" #include "paddle/fluid/jit/function_schema.h" #include "paddle/fluid/jit/layer.h" -#include "paddle/fluid/jit/pe_function.h" #include "paddle/fluid/jit/serializer.h" namespace py = pybind11; @@ -29,27 +28,18 @@ namespace py = pybind11; namespace paddle { namespace pybind { -PyTypeObject *g_executor_function_pytype = nullptr; -PyTypeObject *g_pe_function_pytype = nullptr; +PyTypeObject *g_jit_function_pytype = nullptr; using Variable = paddle::framework::Variable; void BindJit(pybind11::module *m) { py::class_(*m, "Layer", R"DOC(Layer Class.)DOC") - .def("function_dict", - &jit::Layer::FunctionMap, - py::return_value_policy::reference); + .def("function_names", &jit::Layer::FunctionNames) + .def("function", &jit::Layer::Function) + .def("function_info", &jit::Layer::FunctionInfo); - py::class_> - executor_function( - *m, "ExectorFunction", R"DOC(ExectorFunction Class.)DOC"); - g_executor_function_pytype = - reinterpret_cast(executor_function.ptr()); - executor_function.def("info", &jit::ExecutorFunction::Info); - - py::class_> pe_function( - *m, "PEFunction", R"DOC(PEFunction Class.)DOC"); - g_pe_function_pytype = reinterpret_cast(pe_function.ptr()); - pe_function.def("info", &jit::PEFunction::Info); + py::class_> function( + *m, "Function", R"DOC(Function Class.)DOC"); + g_jit_function_pytype = reinterpret_cast(function.ptr()); py::class_>( *m, "FunctionInfo", R"DOC(FunctionInfo Class.)DOC") diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 25dbf5ebb2f62..2f1408257ff1b 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -87,7 +87,8 @@ bool PyObject_CheckLongOrToLong(PyObject** obj) { bool PyObject_CheckFloatOrToFloat(PyObject** obj) { // sometimes users provide PyLong or numpy.int64 but attr is float if (PyFloat_Check(*obj) || 
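The BindAnalysisConfig hunk above raises the default TensorRT workspace_size from 1 << 20 (1 MiB) to 1 << 30 (1 GiB). Callers that depended on the old implicit budget can keep it by passing the value explicitly; a sketch using the Python inference API (the model paths and other arguments are illustrative):

    from paddle.inference import Config, PrecisionType

    config = Config("model.pdmodel", "model.pdiparams")  # hypothetical model files
    config.enable_use_gpu(100, 0)
    config.enable_tensorrt_engine(
        workspace_size=1 << 20,          # restore the old 1 MiB budget explicitly if desired
        max_batch_size=1,
        min_subgraph_size=3,
        precision_mode=PrecisionType.Float32,
    )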
PyLong_Check(*obj) || - PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT + PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype) || // NOLINT + PyObject_IsInstance(*obj, (PyObject*)p_tensor_type)) { // NOLINT return true; } if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 8f66d258edac4..588e5521e6070 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -225,6 +225,7 @@ std::map> op_ins_map = { "Bias3", "Mean3", "Var3"}}, + {"graph_send_recv", {"X", "Src_index", "Dst_index", "Out_size"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 4cdf135b8ed59..cc16b895448c9 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -84,6 +84,7 @@ void BindProgramDesc(pybind11::module *m) { .def("get_feed_target_names", &pd::ProgramDesc::GetFeedTargetNames) .def("get_fetch_target_names", &pd::ProgramDesc::GetFetchTargetNames) .def("serialize_to_string", SerializeMessage) + .def("need_update", &pd::ProgramDesc::NeedUpdate) .def("parse_from_string", [](pd::ProgramDesc &program_desc, const std::string &data) { pd::proto::ProgramDesc *desc = program_desc.Proto(); @@ -289,7 +290,9 @@ void BindOpDesc(pybind11::module *m) { .value("BOOL", pd::proto::AttrType::BOOLEAN) .value("BOOLS", pd::proto::AttrType::BOOLEANS) .value("BLOCK", pd::proto::AttrType::BLOCK) - .value("BLOCKS", pd::proto::AttrType::BLOCKS); + .value("BLOCKS", pd::proto::AttrType::BLOCKS) + .value("VAR", pd::proto::AttrType::VAR) + .value("VARS", pd::proto::AttrType::VARS); pybind11::class_ op_desc(*m, "OpDesc", ""); op_desc @@ -300,8 +303,16 @@ void BindOpDesc(pybind11::module *m) { .def("copy_from", &pd::OpDesc::CopyFrom) .def("type", &pd::OpDesc::Type) .def("set_type", &pd::OpDesc::SetType) - .def("input", &pd::OpDesc::Input) - .def("input_names", &pd::OpDesc::InputNames) + .def("input", + [](pd::OpDesc &self, const std::string &name) { + return self.Input(name); + }) + .def( + "input_names", + [](pd::OpDesc &self, bool with_attr_var) { + return self.InputNames(with_attr_var); + }, + py::arg("with_attr_var") = false) .def("output", &pd::OpDesc::Output) .def("output_names", &pd::OpDesc::OutputNames) .def("set_input", @@ -318,16 +329,46 @@ void BindOpDesc(pybind11::module *m) { }) .def("remove_output", &pd::OpDesc::RemoveOutput) .def("remove_input", &pd::OpDesc::RemoveInput) - .def("input_arg_names", &pd::OpDesc::InputArgumentNames) + .def( + "input_arg_names", + [](pd::OpDesc &self, bool with_attr_var) { + return self.InputArgumentNames(with_attr_var); + }, + py::arg("with_attr_var") = false) .def("output_arg_names", &pd::OpDesc::OutputArgumentNames) .def("_rename_input", &pd::OpDesc::RenameInput) .def("_rename_output", &pd::OpDesc::RenameOutput) - .def("has_attr", &pd::OpDesc::HasAttr) - .def("attr_type", &pd::OpDesc::GetAttrType) - .def("attr_names", &pd::OpDesc::AttrNames) + .def( + "has_attr", + [](pd::OpDesc &self, const std::string &name, bool with_attr_var) { + return self.HasAttr(name, with_attr_var); + }, + py::arg("name"), + py::arg("with_attr_var") = false) + .def( + "attr_type", + [](pd::OpDesc &self, const std::string &name, bool with_attr_var) { + return self.GetAttrType(name, with_attr_var); + }, + py::arg("name"), + py::arg("with_attr_var") = false) + .def( + "attr_names", + [](pd::OpDesc &self, bool with_attr_var) { + return 
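The protobuf bindings above add an optional with_attr_var flag to OpDesc.input_names / input_arg_names / has_attr / attr_type / attr_names (and, just below, attr), plus ProgramDesc.need_update, so Python code can also see attributes that are stored as Var/Vars. A rough sketch against the low-level desc API; whether the chosen op actually carries a Var-typed attribute is an assumption:

    import paddle

    paddle.enable_static()
    prog = paddle.static.default_main_program()
    x = paddle.static.data("x", shape=[-1, 8], dtype="float32")
    y = paddle.nn.functional.dropout(x, p=0.5)

    op = prog.global_block().ops[-1].desc
    print(op.attr_names())          # plain attributes only (with_attr_var defaults to False)
    print(op.attr_names(True))      # also lists attributes held as Vars, if any
    print(prog.desc.need_update())  # True while the desc holds unsynced modifications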
self.AttrNames(with_attr_var); + }, + py::arg("with_attr_var") = false) .def("_set_attr", &pd::OpDesc::SetAttr) .def("remove_attr", &pd::OpDesc::RemoveAttr) - .def("attr", &pd::OpDesc::GetAttr) + .def( + "attr", + [](pd::OpDesc &self, const std::string &name, bool with_attr_var) { + return self.GetAttr(name, with_attr_var); + }, + py::arg("name"), + py::arg("with_attr_var") = false) + .def("set_var_attr", &pd::OpDesc::SetVarAttr) + .def("set_vars_attr", &pd::OpDesc::SetVarsAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr) .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) .def("set_serialized_attr", @@ -351,7 +392,7 @@ void BindOpDesc(pybind11::module *m) { .def("id", &pd::OpDesc::Id) .def("original_id", &pd::OpDesc::OriginalId) .def("set_original_id", &pd::OpDesc::SetOriginalId) - .def("inputs", &pd::OpDesc::Inputs) + .def("inputs", [](pd::OpDesc &self) { return self.Inputs(); }) .def("outputs", &pd::OpDesc::Outputs); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f93e9b6de9221..5575d839a2f79 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -331,8 +331,9 @@ bool SupportsVNNI() { bool IsCompiledWithBrpc() { #ifndef PADDLE_WITH_DISTRIBUTE return false; -#endif +#else return true; +#endif } bool IsCompiledWithDIST() { @@ -1166,34 +1167,33 @@ All parameter, weight, gradient are variables in Paddle. .def("empty", []() { return kEmptyVarName; }) .def("temp", []() { return kTempVarName; }); - // clang-format off py::class_(m, "DeviceContext") .def_static("create", - [](paddle::platform::CPUPlace& place) - -> paddle::platform::DeviceContext* { - auto* context = new phi::CPUContext(); - context->SetAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(place) - .get()); - context->SetHostAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CPUPlace()) - .get()); - context->SetZeroAllocator( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetZeroAllocator(place) - .get()); - return context; + [](paddle::platform::CPUPlace &place) + -> paddle::platform::DeviceContext * { + auto *context = new phi::CPUContext(); + context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get()); + context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place) + .get()); + return context; }) - .def_static("create", - [](paddle::platform::XPUPlace& place) - -> paddle::platform::DeviceContext* { + .def_static( + "create", + [](paddle::platform::XPUPlace &place) + -> paddle::platform::DeviceContext * { #ifndef PADDLE_WITH_XPU - PADDLE_THROW( - platform::errors::PermissionDenied( - "Cannot use XPUPlace in CPU/GPU version, " - "Please recompile or reinstall Paddle with XPU support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use XPUPlace in CPU/GPU version, " + "Please recompile or reinstall Paddle with XPU support.")); #else auto* context = new paddle::platform::XPUDeviceContext(place); context->SetAllocator( @@ -1210,52 +1210,51 @@ All parameter, weight, gradient are variables in Paddle. 
.get()); return context; #endif - }) - .def_static("create", - [](paddle::platform::MLUPlace& place) - -> paddle::platform::DeviceContext* { + }) + .def_static( + "create", + [](paddle::platform::MLUPlace &place) + -> paddle::platform::DeviceContext * { #ifndef PADDLE_WITH_MLU - PADDLE_THROW( - platform::errors::PermissionDenied( - "Cannot use MLUPlace in CPU/GPU version, " - "Please recompile or reinstall Paddle with MLU support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use MLUPlace in CPU/GPU version, " + "Please recompile or reinstall Paddle with MLU support.")); #else return new paddle::platform::MLUDeviceContext(place); #endif - }) - .def_static("create", - [](paddle::platform::NPUPlace& place) - -> paddle::platform::DeviceContext* { + }) + .def_static( + "create", + [](paddle::platform::NPUPlace &place) + -> paddle::platform::DeviceContext * { #ifndef PADDLE_WITH_ASCEND_CL - PADDLE_THROW( - platform::errors::PermissionDenied( - "Cannot use NPUPlace in CPU/GPU/XPU version, " - "Please recompile or reinstall Paddle with NPU support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use NPUPlace in CPU/GPU/XPU version, " + "Please recompile or reinstall Paddle with NPU support.")); #else return new paddle::platform::NPUDeviceContext(place); #endif - }) - .def_static("create", - [](paddle::platform::CustomPlace& place) - -> paddle::platform::DeviceContext* { + }) + .def_static("create", + [](paddle::platform::CustomPlace &place) + -> paddle::platform::DeviceContext * { #ifndef PADDLE_WITH_CUSTOM_DEVICE - PADDLE_THROW( - platform::errors::PermissionDenied( - "Cannot use CustomPlace in CPU/GPU/XPU version, " - "Please recompile or reinstall Paddle with " - "CustomDevice support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CustomPlace in CPU/GPU/XPU version, " + "Please recompile or reinstall Paddle with " + "CustomDevice support.")); #else return new paddle::platform::CustomDeviceContext(place); #endif - }) - .def_static("create", - [](paddle::platform::CUDAPlace& place) - -> paddle::platform::DeviceContext* { + }) + .def_static( + "create", + [](paddle::platform::CUDAPlace &place) + -> paddle::platform::DeviceContext * { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW( - platform::errors::PermissionDenied( - "Cannot use CUDAPlace in CPU only version, " - "Please recompile or reinstall Paddle with CUDA support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CUDAPlace in CPU only version, " + "Please recompile or reinstall Paddle with CUDA support.")); #else auto* context = new phi::GPUContext(place); context->SetAllocator( @@ -1277,20 +1276,19 @@ All parameter, weight, gradient are variables in Paddle. 
context->PartialInitWithAllocator(); return context; #endif - }) - .def_static("create", - [](paddle::platform::CUDAPinnedPlace& place) - -> paddle::platform::DeviceContext* { + }) + .def_static( + "create", + [](paddle::platform::CUDAPinnedPlace &place) + -> paddle::platform::DeviceContext * { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW( - platform::errors::PermissionDenied( - "Cannot use CUDAPinnedPlace in CPU only version, " - "Please recompile or reinstall Paddle with CUDA support.")); + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CUDAPinnedPlace in CPU only version, " + "Please recompile or reinstall Paddle with CUDA support.")); #else return new paddle::platform::CUDAPinnedDeviceContext(place); #endif - });; -// clang-format on + }); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_(m, "Communicator").def(py::init<>()); #endif diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index 3ea07eea244c6..b884c0190981f 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -191,10 +191,10 @@ static void ParseIndexingSlice(framework::LoDTensor* tensor, PyObject* slice_item = PyTuple_GetItem(index, i); infer_flags->push_back(1); - int dim_len = shape[dim]; + int64_t dim_len = shape[dim]; if (PyCheckInteger(slice_item) || IsNumpyType(slice_item)) { // integer, PyLong_AsLong supports both int and long - int start = static_cast(PyLong_AsLong(slice_item)); + int64_t start = static_cast(PyLong_AsLong(slice_item)); auto s_t = start; start = start < 0 ? start + dim_len : start; diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index ccec0c060a3a4..3b0a9f8fb0ce2 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -368,7 +368,7 @@ void SetTensorFromPyArrayT( std::vector dims; dims.reserve(array.ndim()); for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { - dims.push_back(static_cast(array.shape()[i])); + dims.push_back(static_cast(array.shape()[i])); } self->Resize(phi::make_ddim(dims)); @@ -612,8 +612,8 @@ void SetUVATensorFromPyArrayImpl(framework::LoDTensor *self_tensor, dims.reserve(array.ndim()); int64_t numel = 1; for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { - dims.emplace_back(static_cast(array.shape()[i])); - numel *= static_cast(array.shape()[i]); + dims.emplace_back(static_cast(array.shape()[i])); + numel *= static_cast(array.shape()[i]); } self_tensor->Resize(phi::make_ddim(dims)); diff --git a/paddle/infrt/common/type.h b/paddle/infrt/common/type.h index b532fc154ff02..70dd2c5cb4662 100644 --- a/paddle/infrt/common/type.h +++ b/paddle/infrt/common/type.h @@ -172,20 +172,54 @@ const Type& UI1(); template Type type_of(); -// clang-format off -template <> inline Type type_of() { return F32(); } -template <> inline Type type_of() { return F64(); } -template <> inline Type type_of() { return UI8(); } -template <> inline Type type_of() { return UI16(); } -template <> inline Type type_of() { return I32(); } -template <> inline Type type_of() { return UI32(); } -template <> inline Type type_of() { return UI1(); } -template <> inline Type type_of() { return I8(); } -template <> inline Type type_of() { return I64(); } -template <> inline Type type_of() { return UI64(); } -template <> inline Type type_of() { return I8(); } -template <> inline Type type_of() { return Void(); } -// clang-format on +template <> +inline Type type_of() { + return F32(); +} +template <> +inline Type 
type_of() { + return F64(); +} +template <> +inline Type type_of() { + return UI8(); +} +template <> +inline Type type_of() { + return UI16(); +} +template <> +inline Type type_of() { + return I32(); +} +template <> +inline Type type_of() { + return UI32(); +} +template <> +inline Type type_of() { + return UI1(); +} +template <> +inline Type type_of() { + return I8(); +} +template <> +inline Type type_of() { + return I64(); +} +template <> +inline Type type_of() { + return UI64(); +} +template <> +inline Type type_of() { + return I8(); +} +template <> +inline Type type_of() { + return Void(); +} template <> inline Type type_of() { Type x = Int(8); diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 88fefb8eac99d..056b9d79c84e2 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -714,7 +714,11 @@ std::vector split_impl(const Tensor& x, // Calculate the number of out tensors size_t out_number; if (num_or_sections.size() == 1) { - out_number = num_or_sections.GetData()[0]; + if (num_or_sections.GetData()[0] < 0) { + out_number = 1; + } else { + out_number = num_or_sections.GetData()[0]; + } } else { out_number = num_or_sections.size(); } diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 3b44b1876e20d..4f5ecf0aee119 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -// clang-format off #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" @@ -24,7 +23,6 @@ limitations under the License. */ #include "paddle/phi/kernels/transfer_layout_kernel.h" #include "paddle/fluid/framework/tensor_util.h" -// clang-format on namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index cf528beb800ba..70ee28bc2561e 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -// clang-format off #include "paddle/phi/api/include/tensor.h" #include @@ -35,7 +34,6 @@ limitations under the License. */ #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" -// clang-format off namespace paddle { namespace experimental { @@ -312,8 +310,8 @@ void Tensor::set_impl(std::shared_ptr &&impl) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t Tensor::stream() const { int device_id = phi::backends::gpu::GetCurrentDeviceId(); - auto* gpu_context = DeviceContextPool::Instance() - .Get(GPUPlace(device_id)); + auto *gpu_context = DeviceContextPool::Instance().Get( + GPUPlace(device_id)); return gpu_context->stream(); } #endif diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 6d38bbda36310..037cee79b637b 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
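The split_impl hunk above clamps a negative single-entry num_or_sections to one output instead of treating the negative value as a tensor count. At the Python level that corresponds to a one-element sections list containing -1; a small sketch (whether this call dispatches into split_impl is an assumption):

    import paddle

    x = paddle.rand([4, 6])
    # A single -1 section means "everything that is left", i.e. exactly one output tensor.
    outs = paddle.split(x, num_or_sections=[-1], axis=1)
    assert len(outs) == 1 and outs[0].shape == [4, 6]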
*/ -// clang-format off #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/int_array.h" @@ -25,7 +24,6 @@ limitations under the License. */ #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/infermeta/unary.h" -// clang-format off namespace paddle { namespace experimental { @@ -115,13 +113,12 @@ void Tensor::copy_(const Tensor &src, // Deep Copy AutoGrad info from src to self. *autograd_meta_ = *(src.autograd_meta_); } - kernel_key_set.backend_set = - kernel_key_set.backend_set | - BackendSet(phi::TransToPhiBackend(target_place)); + kernel_key_set.backend_set = kernel_key_set.backend_set | + BackendSet(phi::TransToPhiBackend(target_place)); auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); auto place = phi::TransToPhiPlace(kernel_key.backend()); - auto& pool = paddle::experimental::DeviceContextPool::Instance(); - auto* dev_ctx = pool.GetMutable( + auto &pool = paddle::experimental::DeviceContextPool::Instance(); + auto *dev_ctx = pool.GetMutable( place.GetType() == target_place.GetType() ? target_place : place); Backend kernel_backend = Backend::UNDEFINED; diff --git a/paddle/phi/api/yaml/api.yaml b/paddle/phi/api/yaml/api.yaml index 1156206ee4b51..12ea231a939d1 100644 --- a/paddle/phi/api/yaml/api.yaml +++ b/paddle/phi/api/yaml/api.yaml @@ -98,6 +98,43 @@ func : erf backward : erf_grad +- api : erfinv + args : (Tensor x) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + kernel : + func : erfinv + inplace : (x -> out) + backward : erfinv_grad + +- api : fft_c2c + args : (Tensor x, int64_t[] axes, str normalization, bool forward) + output : Tensor + infer_meta : + func : FFTC2CInferMeta + kernel : + func : fft_c2c + backward : fft_c2c_grad + +- api : fft_c2r + args : (Tensor x, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size=0L) + output : Tensor + infer_meta : + func : FFTC2RInferMeta + kernel : + func : fft_c2r + backward : fft_c2r_grad + +- api : fft_r2c + args : (Tensor x, int64_t[] axes, str normalization, bool forward, bool onesided) + output : Tensor + infer_meta : + func : FFTR2CInferMeta + kernel : + func : fft_r2c + backward : fft_r2c_grad + - api : lgamma args : (Tensor x) output : Tensor(out) @@ -105,7 +142,7 @@ func : UnchangedInferMeta kernel : func : lgamma - backward : lgamma_grad + backward : lgamma_grad - api : mv args : (Tensor x, Tensor vec) diff --git a/paddle/phi/api/yaml/api_compat.yaml b/paddle/phi/api/yaml/api_compat.yaml index 5300d551f8ef8..9ac1e8bd719be 100644 --- a/paddle/phi/api/yaml/api_compat.yaml +++ b/paddle/phi/api/yaml/api_compat.yaml @@ -31,6 +31,15 @@ float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, int workspace_size_MB = 512, bool exhaustive_search = false] +- api : conv2d + extra : + attrs : [bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", bool fuse_alpha = false, bool fuse_beta = false, bool use_addto = false, + bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, + int workspace_size_MB = 512, bool exhaustive_search = false] + - api : cross inputs : {x : X, y : Y} @@ -77,6 +86,12 @@ outputs : out : Out +- api : erfinv + inputs : + x : X + outputs : + out : Out + - api : lgamma 
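Moving erfinv and the fft_c2c / fft_c2r / fft_r2c entries into api.yaml above turns them into code-generated final-state ops; their user-facing behavior is unchanged. A quick sketch of the corresponding Python calls:

    import paddle

    x = paddle.uniform([4], min=-0.9, max=0.9)
    y = paddle.erfinv(x)            # backed by the erfinv kernel declared above

    z = paddle.randn([8])
    c = paddle.fft.fft(z)           # complex-to-complex (fft_c2c; real input is cast first)
    r = paddle.fft.rfft(z)          # real-to-complex    (fft_r2c), shape [5]
    w = paddle.fft.irfft(r, n=8)    # complex-to-real    (fft_c2r), back to shape [8]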
inputs : x : X @@ -112,3 +127,15 @@ x : X outputs : out : Out + +- api: fft_c2c + inputs: {x: X} + outputs: {out: Out} + +- api: fft_c2r + inputs: {x: X} + outputs: {out: Out} + +- api: fft_r2c + inputs: {x: X} + outputs: {out: Out} diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 53cdc97a716d7..1b1c0be7aa9fb 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -105,6 +105,48 @@ func : erf_grad data_type : out_grad +- backward_api : erfinv_grad + forward : erfinv (Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] + kernel : + func : erfinv_grad + +- backward_api : fft_c2c_grad + forward: fft_c2c(Tensor x, int64_t[] axes, str normalization, bool forward) -> Tensor(out) + args : (Tensor out_grad, int64_t[] axes, str normalization, bool forward) + output: Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + kernel : + func : fft_c2c_grad + +- backward_api : fft_c2r_grad + forward: fft_c2r(Tensor x, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size) -> Tensor(out) + args : (Tensor out_grad, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size) + output: Tensor(x_grad) + infer_meta : + func : FFTC2RGradInferMeta + kernel : + func : fft_c2r_grad + data_type: out_grad + +- backward_api : fft_r2c_grad + forward: fft_r2c(Tensor x, int64_t[] axes, str normalization, bool forward, bool onesided) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int64_t[] axes, str normalization, bool forward, bool onesided) + output: Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : fft_r2c_grad + data_type: out_grad + no_need_buffer: x + - backward_api : lgamma_grad forward : lgamma(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/api/yaml/generator/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py index 64c2fac85cd00..dc4581472f847 100644 --- a/paddle/phi/api/yaml/generator/api_gen.py +++ b/paddle/phi/api/yaml/generator/api_gen.py @@ -130,7 +130,7 @@ def gene_return_code(self): selected_code = [ f"std::get<{i}>(api_output)" for i in return_out_list ] - return 'return {' + ", ".join(selected_code) + '};' + return 'return std::make_tuple(' + ", ".join(selected_code) + ');' def gene_output(self, out_dtype_list, diff --git a/paddle/phi/api/yaml/generator/filters.py b/paddle/phi/api/yaml/generator/filters.py index cda858ab6e74e..de9fdf25e9834 100644 --- a/paddle/phi/api/yaml/generator/filters.py +++ b/paddle/phi/api/yaml/generator/filters.py @@ -86,9 +86,9 @@ def to_opmaker_name(s): def to_opmaker_name_cstr(s): if s.endswith("_grad"): - return '"{}@GRAD"'.format(to_pascal_case(s[:-5])) + return '"{}@GRAD"'.format(s[:-5]) else: - return '"{}"'.format(to_pascal_case(s)) + return '"{}"'.format(s) def to_pascal_case(s): diff --git a/paddle/phi/api/yaml/generator/generate_op.py b/paddle/phi/api/yaml/generator/generate_op.py index bc8b80efb5156..ac43db18e57c1 100644 --- a/paddle/phi/api/yaml/generator/generate_op.py +++ b/paddle/phi/api/yaml/generator/generate_op.py @@ -111,6 +111,7 @@ def replace_compat_name(api_op_map, forward_api_dict, backward_api_dict): key = args_map[key] if val in args_map: val = args_map[val] + key, val = val, key inplace_map[key] = val forward_api_item['inplace'] = inplace_map diff --git a/paddle/phi/api/yaml/generator/tests.py b/paddle/phi/api/yaml/generator/tests.py index 
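The filters.py hunk above stops PascalCasing generated argument names: to_opmaker_name_cstr now emits the yaml name verbatim (plus "@GRAD" for gradients). A standalone sketch of the new behavior, using the function body from the hunk:

    def to_opmaker_name_cstr(s):
        if s.endswith("_grad"):
            return '"{}@GRAD"'.format(s[:-5])
        return '"{}"'.format(s)

    assert to_opmaker_name_cstr("x") == '"x"'            # previously '"X"'
    assert to_opmaker_name_cstr("x_grad") == '"x@GRAD"'  # previously '"X@GRAD"'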
d322fe1885baa..c96e6ba6ef4ac 100644 --- a/paddle/phi/api/yaml/generator/tests.py +++ b/paddle/phi/api/yaml/generator/tests.py @@ -50,7 +50,7 @@ def supports_selected_rows_kernel(api): def supports_inplace(api): - return "inplace_map" in api + return api['inplace'] is not None def supports_no_need_buffer(api): diff --git a/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py index dfa6a7f93cbcb..0504d3fd10891 100644 --- a/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py +++ b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py @@ -18,6 +18,8 @@ from api_gen import ForwardAPI +kernel_func_set = set() + def get_wrapped_infermeta_name(api_name): return api_name.capitalize() + 'InferMeta' @@ -29,6 +31,9 @@ def gene_wrapped_infermeta_and_register(api): PD_REGISTER_INFER_META_FN({api.kernel['func'][0]}, phi::{api.infer_meta['func']});""" if api.infer_meta['param'] is not None: + if api.kernel['func'][0] in kernel_func_set: + return '', '', '' + kernel_params = api.kernel['param'] if kernel_params is None: kernel_params = api.inputs['names'] + api.attrs['names'] @@ -78,6 +83,7 @@ def gene_wrapped_infermeta_and_register(api): register_code = f""" PD_REGISTER_INFER_META_FN({api.kernel['func'][0]}, phi::{get_wrapped_infermeta_name(api.kernel['func'][0])});""" + kernel_func_set.add(api.kernel['func'][0]) return declare_code, defind_code, register_code else: return '', '', register_code diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index b4ca7148a4080..d58acfd77e203 100755 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -455,6 +455,14 @@ func : celu backward : celu_grad +- api : class_center_sample + args : (Tensor label, int num_classes, int num_samples, int ring_id, int rank, int nranks, bool fix_seed, int seed) + output : Tensor(remapped_label), Tensor(sampled_local_class_center) + infer_meta : + func : ClassCenterSampleInferMeta + kernel : + func : class_center_sample + - api : clip args : (Tensor x, Scalar(float) min, Scalar(float) max) output : Tensor(out) @@ -667,7 +675,7 @@ backward : divide_grad - api : dropout - args : (Tensor x, Tensor seed_tensor, float p, bool is_test, str mode, int seed, bool fix_seed) + args : (Tensor x, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed, bool fix_seed) output : Tensor(out), Tensor(mask) infer_meta : func : DropoutInferMeta @@ -790,16 +798,6 @@ kernel : func : equal_all -- api : erfinv - args : (Tensor x) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - kernel : - func : erfinv - inplace : (x -> out) - backward : erfinv_grad - # exp - api : exp args : (Tensor x) @@ -864,6 +862,17 @@ data_type : dtype backend : place +- api : fill + args : (Tensor x, Scalar value) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : fill + inplace : (x -> out) + backward: fill_grad + - api : fill_diagonal args : (Tensor x, float value, int offset, bool wrap) output : Tensor(out) @@ -874,6 +883,16 @@ inplace : (x -> out) backward : fill_diagonal_grad +- api : fill_diagonal_tensor + args : (Tensor x, Tensor y, int64_t offset, int dim1, int dim2) + output : Tensor(out) + infer_meta : + func : FillDiagonalTensorInferMeta + kernel : + func : fill_diagonal_tensor + inplace : (x -> out) + backward : fill_diagonal_tensor_grad + - api : flatten args : (Tensor x, int start_axis, int stop_axis) output : Tensor(out), Tensor(xshape) @@ -964,6 +983,20 @@ data_type 
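wrapped_infermeta_gen.py above now keeps a module-level kernel_func_set so that a kernel function shared by several api entries gets only one wrapped InferMeta definition and one PD_REGISTER_INFER_META_FN registration. The pattern, reduced to its essentials (make_code stands in for the real code-emission logic and is hypothetical):

    kernel_func_set = set()  # shared across all generated apis

    def gene_wrapped_infermeta(kernel_func, make_code):
        # Skip kernels whose wrapped InferMeta was already emitted, to avoid
        # duplicate definitions/registrations in the generated source.
        if kernel_func in kernel_func_set:
            return '', '', ''
        declare, define, register = make_code(kernel_func)
        kernel_func_set.add(kernel_func)
        return declare, define, register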
: dtype backend : place +# full +- api : full_ + args : (Tensor output, IntArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) + output : Tensor(out) + inplace : (output -> out) + infer_meta : + func : CreateInferMeta + param : [shape, dtype] + kernel : + func : full + param : [shape, value, dtype] + data_type : dtype + backend : place + - api : full_batch_size_like args : (Tensor input, int[] shape, DataType dtype, Scalar value, int input_dim_idx, int output_dim_idx, Place place=CPUPlace()) output: Tensor @@ -1040,8 +1073,16 @@ func : gelu backward : gelu_grad +- api : generate_proposals_v2 + args : (Tensor scores, Tensor bbox_deltas, Tensor im_shape, Tensor anchors, Tensor variances, int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, float eta, bool pixel_offset=true) + output : Tensor(rpn_rois), Tensor(rpn_roi_probs), Tensor(rpn_rois_num) + infer_meta : + func : GenerateProposalsV2InferMeta + kernel : + func : generate_proposals_v2 + - api : graph_send_recv - args : (Tensor x, Tensor src_index, Tensor dst_index, str pool_type = "SUM", int64_t out_size = 0) + args : (Tensor x, Tensor src_index, Tensor dst_index, str pool_type = "SUM", IntArray out_size = {0}) output : Tensor(out), Tensor(dst_count) infer_meta : func : GraphSendRecvInferMeta @@ -1309,6 +1350,18 @@ optional : prior_dist backward : label_smooth_grad +- api : lamb_ + args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, float weight_decay, float beta1, float beta2, float epsilon, bool multi_precision) + output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs) + infer_meta : + func : LambInferMeta + kernel : + func : lamb {dense, dense, dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense}, + lamb_sr {dense, selected_rows, dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense} + data_type : param + optional : master_param, skip_update + inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs) + - api : layer_norm args : (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis, bool is_test) output : Tensor(out), Tensor(mean), Tensor(variance) @@ -1523,6 +1576,16 @@ data_type : x backward : lu_unpack_grad +- api : margin_cross_entropy + args : (Tensor logits, Tensor label, bool return_softmax, int ring_id, int rank, int nranks, float margin1, float margin2, float margin3, float scale) + output : Tensor(softmax), Tensor(loss) + infer_meta : + func : MarginCrossEntropyInferMeta + kernel : + func : margin_cross_entropy + data_type : logits + backward : margin_cross_entropy_grad + # masked_select - api : masked_select args : (Tensor x, Tensor mask) @@ -2474,6 +2537,7 @@ func : BatchNormInferMeta kernel : func : sync_batch_norm + data_type : x backward : sync_batch_norm_grad # take_along_axis @@ -2781,6 +2845,16 @@ func: eig backward: eig_grad +# fold +- api: fold + args: (Tensor x, int[] output_sizes, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) + output: Tensor(out) + infer_meta: + func: FoldInferMeta + kernel: + func: fold + backward: fold_grad + # overlap_add - api: overlap_add args: (Tensor x, int hop_length, int axis) @@ -2801,3 +2875,25 @@ 
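In the legacy_api.yaml hunk above, graph_send_recv's out_size changes from a plain int64_t to an IntArray, so the first output dimension can be supplied by a Tensor whose value is only known at run time. A hedged sketch of the Python call (the incubate API name and keyword reflect current usage and may differ by release):

    import paddle

    x = paddle.to_tensor([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])
    src = paddle.to_tensor([0, 1, 2], dtype="int32")
    dst = paddle.to_tensor([1, 2, 0], dtype="int32")
    out_size = paddle.to_tensor([3], dtype="int32")  # runtime-determined row count

    out = paddle.incubate.graph_send_recv(
        x, src, dst, pool_type="sum", out_size=out_size)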
data_type: x inplace: (x -> out) backward: uniform_random_inplace_grad + +# unpool +- api: unpool + args: (Tensor x, Tensor indices, int[] ksize, int[] strides, int[] padding, int[] output_size, str data_format) + output: Tensor(out) + infer_meta: + func: UnpoolInferMeta + kernel: + func: unpool + data_type: x + backward: unpool_grad + +# unpool3d +- api: unpool3d + args: (Tensor x, Tensor indices, int[] ksize, int[] strides, int[] padding, int[] output_size, str data_format) + output: Tensor(out) + infer_meta: + func: Unpool3dInferMeta + kernel: + func: unpool3d + data_type: x + backward: unpool3d_grad diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 8b43f7643c796..fdf2321ea38e1 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -644,8 +644,8 @@ backward : divide_double_grad - backward_api : dropout_grad - forward : dropout (Tensor x, Tensor seed_tensor, float p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(mask) - args : (Tensor mask, Tensor out_grad, float p, bool is_test, str mode) + forward : dropout (Tensor x, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(mask) + args : (Tensor mask, Tensor out_grad, Scalar p, bool is_test, str mode) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta @@ -740,16 +740,6 @@ output : Tensor(weight_grad) invoke : embedding_grad_impl(x, weight, out_grad, padding_idx, sparse, weight_grad) -- backward_api : erfinv_grad - forward : erfinv (Tensor x) -> Tensor(out) - args : (Tensor out, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [out] - kernel : - func : erfinv_grad - - backward_api : exp_grad forward : exp (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -811,7 +801,7 @@ infer_meta : func : UnchangedInferMeta invoke : zeros_like(out_grad, DataType::UNDEFINED, {}) - + - backward_api : fill_diagonal_grad forward : fill_diagonal (Tensor x, float value, int offset, bool wrap) -> Tensor(out) args : (Tensor out_grad, float value, int offset, bool wrap) @@ -820,6 +810,26 @@ func : FillDiagonalGradInferMeta kernel : func : fill_diagonal_grad + +- backward_api : fill_diagonal_tensor_grad + forward : fill_diagonal_tensor (Tensor x, Tensor y, int64_t offset, int dim1, int dim2) -> Tensor(out) + args : (Tensor out_grad, int64_t offset, int dim1, int dim2) + output : Tensor(x_grad) + infer_meta : + func : FillDiagonalTensorGradInferMeta + kernel : + func : fill_diagonal_tensor_grad + inplace : (out_grad -> x_grad) + +- backward_api : fill_grad + forward : fill (Tensor x, Scalar value) -> Tensor(out) + args : (Tensor out_grad, Scalar value) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + kernel : + func : fill_grad inplace : (out_grad -> x_grad) - backward_api : flatten_grad @@ -931,7 +941,7 @@ func : gelu_grad - backward_api : graph_send_recv_grad - forward : graph_send_recv (Tensor x, Tensor src_index, Tensor dst_index, str pool_type = "SUM", int64_t out_size = 0) -> Tensor(out), Tensor(dst_count) + forward : graph_send_recv (Tensor x, Tensor src_index, Tensor dst_index, str pool_type = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count) args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str pool_type = "SUM") output : Tensor(x_grad) infer_meta : @@ -1316,6 +1326,17 @@ kernel : func : 
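The new unpool / unpool3d entries above wire max-unpooling into the final-state API. At the Python level this corresponds to the functional unpool ops; a short sketch, assuming the existing max_pool2d / max_unpool2d wrappers dispatch here (kernel and stride values are illustrative):

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([1, 1, 6, 6])
    pooled, indices = F.max_pool2d(x, kernel_size=2, stride=2, return_mask=True)
    restored = F.max_unpool2d(pooled, indices, kernel_size=2, stride=2,
                              output_size=x.shape[2:])  # maps back to 6x6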
lu_unpack_grad +- backward_api : margin_cross_entropy_grad + forward : margin_cross_entropy (Tensor logits, Tensor label, bool return_softmax, int ring_id, int rank, int nranks, float margin1, float margin2, float margin3, float scale) -> Tensor(softmax), Tensor(loss) + args : (Tensor logits, Tensor label, Tensor softmax, Tensor loss_grad, bool return_softmax, int ring_id, int rank, int nranks, float margin1, float margin2, float margin3, float scale) + output : Tensor(logits_grad) + infer_meta : + func : MarginCrossEntropyGradInferMeta + kernel : + func : margin_cross_entropy_grad + data_type : softmax + inplace : (softmax -> logits_grad) + - backward_api : masked_select_grad forward : masked_select (Tensor x, Tensor mask) -> Tensor(out) args : (Tensor x, Tensor mask, Tensor out_grad) @@ -2625,3 +2646,41 @@ kernel : func : yolov3_loss_grad optional : gt_score + +# fold +- backward_api: fold_grad + forward: fold (Tensor x, int[] output_sizes, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) -> Tensor(out) + args: (Tensor x, Tensor out_grad, int[] output_sizes, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) + output: Tensor(x_grad) + infer_meta: + func: UnchangedInferMeta + param : [x] + kernel: + func: fold_grad + no_need_buffer : x + +# unpool3d +- backward_api: unpool3d_grad + forward: unpool3d (Tensor x, Tensor indices, int[] ksize, int[] strides, int[] padding, int[] output_size, str data_format) -> Tensor(out) + args: (Tensor x, Tensor indices, Tensor out, Tensor out_grad, int[] ksize, int[] strides, int[] padding, int[] output_size, str data_format) + output: Tensor(x_grad) + infer_meta: + func: UnchangedInferMeta + param : [x] + kernel: + func: unpool3d_grad + data_type: x + no_need_buffer : x + +# unpool +- backward_api: unpool_grad + forward: unpool (Tensor x, Tensor indices, int[] ksize, int[] strides, int[] padding, int[] output_size, str data_format) -> Tensor(out) + args: (Tensor x, Tensor indices, Tensor out, Tensor out_grad, int[] ksize, int[] strides, int[] padding, int[] output_size, str data_format) + output: Tensor(x_grad) + infer_meta: + func: UnchangedInferMeta + param : [x] + kernel: + func: unpool_grad + data_type: x + no_need_buffer : x diff --git a/paddle/phi/backends/gpu/cuda/cuda_info.cc b/paddle/phi/backends/gpu/cuda/cuda_info.cc index 7be21e85f0005..4b5de3db54d19 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_info.cc +++ b/paddle/phi/backends/gpu/cuda/cuda_info.cc @@ -241,6 +241,7 @@ void SetDeviceId(int id) { id, GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); + VLOG(4) << "SetDeviceId " << id; } void GpuMemcpyAsync(void *dst, diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index b89d5a3c1624f..1646d9666ff42 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -301,14 +301,10 @@ bool IsGPUManagedMemorySupported(int dev_id) { "but received id is: %d. 
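margin_cross_entropy_grad above reuses the forward softmax and writes the gradient in place (softmax -> logits_grad), matching how the fused margin softmax loss is used from Python. A sketch of the forward call; the margin values are the common ArcFace-style defaults, not something this patch prescribes:

    import paddle
    import paddle.nn.functional as F

    logits = paddle.randn([4, 10])
    label = paddle.randint(0, 10, [4], dtype="int64")
    loss, softmax = F.margin_cross_entropy(
        logits, label,
        margin1=1.0, margin2=0.5, margin3=0.0, scale=64.0,
        return_softmax=True, reduction=None)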
GPU count is: %d.", dev_id, GetGPUDeviceCount())); -#if defined(__linux__) || defined(_WIN32) - int ManagedMemoryAttr; - PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( - &ManagedMemoryAttr, hipDeviceAttributeManagedMemory, dev_id)); - return ManagedMemoryAttr != 0; -#else + // TODO(qili93): Hygon DTK (21.04 and 22.04) not support + // hipDeviceAttributeManagedMemory, temporary disable by default, to be + // verified in next DTK release return false; -#endif } bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { diff --git a/paddle/phi/backends/xpu/enforce_xpu.h b/paddle/phi/backends/xpu/enforce_xpu.h index 30095e3a0074a..44763d408f7d7 100644 --- a/paddle/phi/backends/xpu/enforce_xpu.h +++ b/paddle/phi/backends/xpu/enforce_xpu.h @@ -16,7 +16,9 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/xpu/xpu_header.h" +#ifdef PADDLE_WITH_XPU_BKCL #include "xpu/bkcl.h" +#endif namespace phi { namespace backends { @@ -97,6 +99,7 @@ inline const char* xpuGetErrorString(int stat) { } } +#ifdef PADDLE_WITH_XPU_BKCL inline const char* bkclGetErrorString(BKCLResult_t stat) { switch (stat) { case BKCL_SUCCESS: @@ -113,6 +116,7 @@ inline const char* bkclGetErrorString(BKCLResult_t stat) { return "Unknown BKCL status"; } } +#endif inline const char* xdnnGetErrorString(int stat) { switch (stat) { @@ -136,10 +140,12 @@ inline std::string build_xpu_error_msg(int stat) { return msg + xpuGetErrorString(stat) + " "; } +#ifdef PADDLE_WITH_XPU_BKCL inline std::string build_xpu_error_msg(BKCLResult_t stat) { std::string msg("BKCL Error, "); return msg + bkclGetErrorString(stat) + " "; } +#endif inline std::string build_xpu_xdnn_error_msg(int stat, std::string msg) { return msg + " XDNN Error, " + xdnnGetErrorString(stat) + " "; @@ -158,7 +164,9 @@ struct ExternalApiType {}; } DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS); +#ifdef PADDLE_WITH_XPU_BKCL DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); +#endif #undef DEFINE_EXTERNAL_API_TYPE diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index c6d49bd5b978b..bfdc381482318 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -48,6 +48,7 @@ enum class Backend : uint8_t { XPU, // XPU currently does not exist at the same time as CUDA NPU, // NPU currently does not exist at the same time as CUDA MLU, // MLU currently does not exist at the same time as CUDA + IPU, // the third library backend ONEDNN, @@ -56,8 +57,6 @@ enum class Backend : uint8_t { // paddle kernel primitives backend KPS, - IPU, - // end of backend types NUM_BACKENDS, diff --git a/paddle/phi/common/layout.h b/paddle/phi/common/layout.h index 6b2f657699fb1..2d74abeb84d64 100644 --- a/paddle/phi/common/layout.h +++ b/paddle/phi/common/layout.h @@ -18,8 +18,20 @@ limitations under the License. */ namespace paddle { namespace experimental { +// Note: The original design of paddle DataLayout is confusing. +// It contains two levels of "layout", one is the data layout +// at the Tensor level, including Dense, Sparse, etc., and the other +// is the format at the data level, including NHWC, NCHW, etc., +// these should belong to the concept of "data format". +// The concepts of these two levels are mixed into an enumeration class, +// which leads to some strange execution scheduling logic. +// It needs to be refactored in the future. +// In order to maintain compatibility, we still use the design of the +// original framework here. 
+ // Note: Here the DataLayout is public api for external users, the prefix `k` // maybe confuse users, so we use all uppercase names + enum class DataLayout { UNDEFINED = 0, // TODO(chenweihang): keep ANY for compatibility, remove it later @@ -32,16 +44,21 @@ enum class DataLayout { SPARSE_COO, SPARSE_CSR, PSTRING_UNION, + NUM_DATA_LAYOUTS, + // See Note [ Why we need ALL in basic kernel key member? ] ALL_LAYOUT = UNDEFINED, + // Note: Unify phi DataLayout and fluid::framework::DataLayout, // for compatible with fluid DataLayout, here need prefix `k` + // Note: The original `kAnyLayout (enum value 2)` is a strange design. // `kAnyLayout` originally cannot represent any kind of Layout, // at the same time, it can also represent any Layout. // Strictly, it means "default" or "undefined" layout, - // and should not be mixed with other meaningful layouts. + // and should not be mixed with other meaningful layouts + kAnyLayout = ANY, kNHWC = NHWC, kNCHW = NCHW, diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index ae3b8924ece69..f4cca91a562da 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -51,6 +51,7 @@ const std::unordered_set deprecated_op_names({"diag", "squeeze_grad", "isfinite", "matmul", + "fill", "matmul_grad", "matmul_grad_grad", "max", diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 09098705b11e4..d16a019c7ab0d 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -22,7 +22,7 @@ limitations under the License. */ /* @jim19930609: Move to MKLDNN_Tensor in the future */ #ifdef PADDLE_WITH_MKLDNN -#include "dnnl.hpp" +#include "dnnl.hpp" // NOLINT #endif namespace phi { @@ -174,6 +174,36 @@ class DenseTensor : public TensorBase, /* Temporarily put InplaceVersion inside DenseTensor. Will move to AutogradMeta as soon as we switch to Eager Dygraph. */ + /* + NOTE(liym27): [ What is TensorInplaceVersion used for? ] + + TensorInplaceVersion is a version counter and every Tensor has a version + counter. It's used to check whether an inplace operation will result in an + incorrect gradient calculation. Version is incremented when the data of the + Variable is modified in place. + + - Question: In what scenarios will version counters be shared? + - Answer: When two Variables/VarBases share the same C++ Tensor(its Allocation + may change), both of them share the same version counter. For examples: + 1. `z = paddle.assign(input=x, output=y)`, `z` shares the same version + counter of `y` because z and y is the same VarBase; + 2. `y = x.detach()`, `y` shares the same version counter of `x`. + + - Question: In what scenarios will version counters NOT be shared? + - Answer: Replacing a `Variable`'s data by calling + `Tensor::ShareDataWith(...)` or `Tensor::ShareBufferWith(...)`. Because they + share the same Allocation but not framework::Tensor. + + - Question: Why put the inplace_version_counter_ in framework::Tensor instead + of Allocation or Variable? + - Answer: + 1. Tensor can call ResetHolder() to reset the corresponding Allocation so + that the inplace_version_counter_ changes if it's in Allocation, which will + lead to confusing information about inplace version. + 2. If inplace_version_counter_ is in Variable, different VariableWrappers + should be able to share the same Variable. However, a VariableWrapper hold a + Variable object but not a pointer. 
+ */ class InplaceVersion { public: bool IsUnique() const { return inplace_version_ == 0; } diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc index 8074fbeb49180..4eb580955a97c 100644 --- a/paddle/phi/core/enforce.cc +++ b/paddle/phi/core/enforce.cc @@ -27,6 +27,7 @@ class EagerVariable; } namespace paddle { namespace framework { +class VarDesc; class BlockDesc; using Attribute = paddle::variant, std::vector, - std::vector>; + std::vector, + VarDesc*, + std::vector>; using AttributeMap = std::unordered_map; } // namespace framework namespace imperative { diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index 8b3d4a1427340..dcd25180e2997 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -35,6 +35,19 @@ void Copy(const Context& dev_ctx, auto* src_ptr = src.data(); const auto& src_place = src.place(); + if (&src == dst) { + if (paddle::platform::is_same_place(src_place, dst_place)) { + VLOG(6) << "Skip copy the same data(" << src_ptr << ") from " << src_place + << " to " << dst_place; + } else { + VLOG(6) << "Src and dst are the same Tensor, in-place copy data(" + << src_ptr << ") from " << src_place << " to " << dst_place; + const DenseTensor src_copy = src; + Copy(dev_ctx, src_copy, dst_place, blocking, dst); + } + return; + } + VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index 975d55889c717..9877149dc52bd 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -97,4 +97,18 @@ inline DataType ToComplexType(const DataType& type) { type)); } } + +inline DataType ToRealType(const DataType& type) { + switch (type) { + case DataType::COMPLEX64: + return DataType::FLOAT32; + case DataType::COMPLEX128: + return DataType::FLOAT64; + default: + PADDLE_THROW(errors::Unimplemented( + "Can not transform data type (%s) to real type, now only support " + "complex64 and complex128 value.", + type)); + } +} } // namespace phi diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index bd972c8ceedc7..77fa2a3decce3 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -87,6 +87,20 @@ namespace phi { } \ }() +#define PD_VISIT_BASE_INTEGRAL_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + ///////// Complex Dispatch Marco /////////// #define PD_VISIT_COMPLEX_TYPES(TYPE, NAME, ...) \ diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 87640cdddbfc9..83cf1a713dc55 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -14,6 +14,7 @@ limitations under the License. 
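The version-counter note carried into DenseTensor above is easiest to see from Python: tensors that share the same underlying C++ tensor share one inplace version, and autograd uses it to detect writes that would corrupt saved inputs. A small sketch of the behavior the counter guards against (the exact error text is not specified here, so backward is left commented):

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
    y = x * 2
    z = y.detach()              # z shares y's tensor, hence y's inplace version counter
    out = paddle.sum(y * y)     # backward needs the original value of y

    z.add_(paddle.ones([3]))    # in-place write bumps the shared version counter
    # out.backward() is now expected to raise, because y changed after being
    # captured for the gradient computation.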
*/ #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/common/type_traits.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/axis_utils.h" namespace phi { @@ -285,6 +286,47 @@ void EigvalshGradInferMeta(const MetaTensor& out_v, } } +void FFTC2RGradInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL(out, + phi::errors::InvalidArgument( + "Output of fft_c2r _grad should not be null.")); + const phi::DDim x_dim = x.dims(); + + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + for (size_t i = 0; i < axes.size(); i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + + out->set_layout(x.layout()); + out->set_dtype(ToComplexType(x.dtype())); + + phi::DDim out_dim = x.dims(); + const int64_t last_fft_axis = axes.back(); + if (last_dim_size > 0) { + out_dim.at(last_fft_axis) = last_dim_size / 2 + 1; + } else if (config.is_runtime) { + const int64_t last_fft_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; + } else { + const int64_t last_fft_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = + last_fft_dim_size == -1 ? -1 : last_fft_dim_size / 2 + 1; + } + out->set_dims(out_dim); +} + void FillDiagonalGradInferMeta(const MetaTensor& dout, float value, int offset, @@ -297,6 +339,17 @@ void FillDiagonalGradInferMeta(const MetaTensor& dout, } } +void FillDiagonalTensorGradInferMeta(const MetaTensor& out_grad, + int64_t offset, + int dim1, + int dim2, + MetaTensor* x_grad) { + if (x_grad != nullptr) { + x_grad->set_dims(out_grad.dims()); + x_grad->set_dtype(out_grad.dtype()); + } +} + void GatherNdGradInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& out_grad, @@ -507,6 +560,30 @@ void LUUnpackGradInferMeta(const MetaTensor& x, } } +void MarginCrossEntropyGradInferMeta(const MetaTensor& logits, + const MetaTensor& label, + const MetaTensor& softmax, + const MetaTensor& loss_grad, + bool return_softmax, + int ring_id, + int rank, + int nranks, + float margin1, + float margin2, + float margin3, + float scale, + MetaTensor* logits_grad) { + PADDLE_ENFORCE_NE( + logits_grad, + nullptr, + phi::errors::InvalidArgument( + "The Logits@GRAD in MarginCrossEntropy can't be nullptr.")); + auto softmax_dims = softmax.dims(); + + logits_grad->set_dims(softmax_dims); + logits_grad->set_dtype(softmax.dtype()); +} + void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, const MetaTensor& mask, const MetaTensor& dout, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h old mode 100755 new mode 100644 index 1ada2c8015794..36edb0e56bafd --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -137,9 +137,23 @@ void EigvalshGradInferMeta(const MetaTensor& out_v, bool is_test, MetaTensor* x_grad); +void FFTC2RGradInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig = MetaConfig()); + void FillDiagonalGradInferMeta( const MetaTensor& dout, float value, int offset, bool wrap, MetaTensor* dx); +void FillDiagonalTensorGradInferMeta(const MetaTensor& out_grad, + int64_t offset, + int dim1, + int dim2, + 
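FFTC2RGradInferMeta above encodes the usual real-FFT size relation: the complex side of a length-n real transform has n / 2 + 1 elements along the last transformed axis (or last_dim_size / 2 + 1 when an explicit length is given). The same relation is visible from the Python fft API:

    import paddle

    x = paddle.randn([8])
    freq = paddle.fft.rfft(x)           # real-to-complex: shape [8 // 2 + 1] == [5]
    back = paddle.fft.irfft(freq, n=8)  # complex-to-real: n pins the output length to 8
    assert list(freq.shape) == [5] and list(back.shape) == [8]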
MetaTensor* x_grad); + void GatherNdGradInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& out_grad, @@ -231,6 +245,20 @@ void LUUnpackGradInferMeta(const MetaTensor& x, bool unpack_pivots, MetaTensor* x_grad); +void MarginCrossEntropyGradInferMeta(const MetaTensor& logits, + const MetaTensor& label, + const MetaTensor& softmax, + const MetaTensor& loss_grad, + bool return_softmax, + int ring_id, + int rank, + int nranks, + float margin1, + float margin2, + float margin3, + float scale, + MetaTensor* logits_grad); + void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, const MetaTensor& mask, const MetaTensor& dout, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 8ba4290e69fb1..44e53fc32ccff 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -953,7 +953,7 @@ void DistInferMeta(const MetaTensor& x, void DropoutInferMeta(const MetaTensor& x, const MetaTensor& seed_tensor, - float p, + const Scalar& p, bool is_test, const std::string& mode, int seed, @@ -973,7 +973,7 @@ void DropoutInferMeta(const MetaTensor& x, void DropoutNdInferMeta(const MetaTensor& x, const MetaTensor& seed_tensor, - float p, + const Scalar& p, bool is_test, const std::string& mode, int seed, @@ -1174,6 +1174,19 @@ void ExpandAsInferMeta(const MetaTensor& x, #undef MAX_RANK_SUPPORTED } +void FillDiagonalTensorInferMeta(const MetaTensor& x, + const MetaTensor& y, + int64_t offset, + int dim1, + int dim2, + MetaTensor* out) { + PADDLE_ENFORCE_NOT_NULL(out, + phi::errors::InvalidArgument( + "Output Tensor (out) should not be nullptr.")); + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); +} + void GatherInferMeta(const MetaTensor& x, const MetaTensor& index, const Scalar& axis, @@ -1532,6 +1545,65 @@ void LUUnpackInferMeta(const MetaTensor& x, } } +void MarginCrossEntropyInferMeta(const MetaTensor& logits, + const MetaTensor& label, + bool return_softmax, + int ring_id, + int rank, + int nranks, + float margin1, + float margin2, + float margin3, + float scale, + MetaTensor* softmax, + MetaTensor* loss, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + logits, + phi::errors::InvalidArgument("Input of logits should not be null.")); + PADDLE_ENFORCE_NOT_NULL( + label, + phi::errors::InvalidArgument("Input of label should not be null.")); + auto logits_dims = logits.dims(); + auto labels_dims = label.dims(); + + auto logits_rank = logits_dims.size(); + auto axis = logits_rank - 1; + for (int i = 0; i < logits_rank; i++) { + if (i != axis) { + if (config.is_runtime || (logits_dims[i] > 0 && labels_dims[i] > 0)) { + PADDLE_ENFORCE_EQ(logits_dims[i], + labels_dims[i], + phi::errors::InvalidArgument( + "Input(Logits) and Input(Label) should in " + "same shape in dimensions except axis.")); + } + } + } + + if (labels_dims.size() > 1) { + PADDLE_ENFORCE_EQ( + labels_dims[logits_rank - 1], + 1UL, + phi::errors::InvalidArgument( + "the last dimension of Input(Label) should be 1." 
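FillDiagonalTensorInferMeta above gives the new fill_diagonal_tensor op an output with the same shape and dtype as x; the values of y are written along the (dim1, dim2) diagonal selected by offset. A sketch of the corresponding tensor method, assuming the existing Tensor.fill_diagonal_tensor Python wrapper dispatches to this op:

    import paddle

    x = paddle.zeros([3, 3])
    y = paddle.to_tensor([1.0, 2.0, 3.0])  # one value per diagonal element
    out = x.fill_diagonal_tensor(y, offset=0, dim1=0, dim2=1)
    # out:
    # [[1., 0., 0.],
    #  [0., 2., 0.],
    #  [0., 0., 3.]]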
+ "But received: the last dimension of Input(Label) is [%d]," + "the last dimension is [%d]", + labels_dims[logits_rank - 1], + logits_rank - 1)); + } + + softmax->set_dims(logits_dims); + softmax->set_dtype(logits.dtype()); + + logits_dims[axis] = 1; + loss->set_dims(logits_dims); + loss->set_dtype(logits.dtype()); + + softmax->share_lod(logits); + loss->share_lod(logits); +} + void MaskedSelectInferMeta(const MetaTensor& x, const MetaTensor& mask, MetaTensor* out) { @@ -2554,6 +2626,89 @@ void SolveInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { out->share_lod(x); } +void UnpoolInferMeta(const MetaTensor& x, + const MetaTensor& indices, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + MetaTensor* out, + MetaConfig config) { + auto in_x_dims = x.dims(); + auto in_y_dims = indices.dims(); + + PADDLE_ENFORCE_EQ(in_x_dims.size() == 4, + true, + phi::errors::InvalidArgument( + "Unpool Intput(X) must be of 4-dimensional, but " + "received Input(X)'s dimensions is %d.", + in_x_dims.size())); + PADDLE_ENFORCE_EQ(in_x_dims, + in_y_dims, + phi::errors::InvalidArgument( + "The dimensions of Input(X) must equal to be" + "the dimensions of Input(Indices), but received" + "dimensions of Input(X) is [%d], received dimensions" + "of Input(Indices) is [%d]", + in_x_dims, + in_y_dims)); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + if (!config.is_runtime && in_x_dims[i + 2] <= 0) { + output_shape.push_back(-1); + } else { + output_shape.push_back(output_size[i]); + } + } + if (out != nullptr) { + out->set_dims(phi::make_ddim(output_shape)); + out->set_dtype(x.dtype()); + } +} +void Unpool3dInferMeta(const MetaTensor& x, + const MetaTensor& indices, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + MetaTensor* out, + MetaConfig config) { + auto in_x_dims = x.dims(); + auto in_y_dims = indices.dims(); + + PADDLE_ENFORCE_EQ(in_x_dims.size() == 5, + true, + phi::errors::InvalidArgument( + "Unpool Intput(X) must be of 5-dimensional, but " + "received Input(X)'s dimensions is %d.", + in_x_dims.size())); + PADDLE_ENFORCE_EQ(in_x_dims, + in_y_dims, + phi::errors::InvalidArgument( + "The dimensions of Input(X) must equal to be" + "the dimensions of Input(Indices), but received" + "dimensions of Input(X) is [%d], received dimensions" + "of Input(Indices) is [%d]", + in_x_dims, + in_y_dims)); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + if (!config.is_runtime && in_x_dims[i + 2] <= 0) { + output_shape.push_back(-1); + } else { + output_shape.push_back(output_size[i]); + } + } + if (out != nullptr) { + out->set_dims(phi::make_ddim(output_shape)); + out->set_dtype(x.dtype()); + } +} + } // namespace phi PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 9f548256f4fa0..7dcbe33e0a933 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -143,7 +143,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void DropoutInferMeta(const MetaTensor& x, const MetaTensor& seed_tensor, - float p, + const Scalar& p, bool is_test, const std::string& mode, int seed, @@ -153,7 +153,7 @@ void DropoutInferMeta(const MetaTensor& 
x, void DropoutNdInferMeta(const MetaTensor& x, const MetaTensor& seed_tensor, - float p, + const Scalar& p, bool is_test, const std::string& mode, int seed, @@ -182,6 +182,13 @@ void ExpandAsInferMeta(const MetaTensor& x, const std::vector& target_shape, MetaTensor* out); +void FillDiagonalTensorInferMeta(const MetaTensor& x, + const MetaTensor& y, + int64_t offset, + int dim1, + int dim2, + MetaTensor* out); + void GatherInferMeta(const MetaTensor& x, const MetaTensor& index, const Scalar& axis, @@ -233,6 +240,20 @@ void LUUnpackInferMeta(const MetaTensor& x, MetaTensor* l, MetaTensor* u); +void MarginCrossEntropyInferMeta(const MetaTensor& logits, + const MetaTensor& label, + bool return_softmax, + int ring_id, + int rank, + int nranks, + float margin1, + float margin2, + float margin3, + float scale, + MetaTensor* softmax, + MetaTensor* loss, + MetaConfig config = MetaConfig()); + void MaskedSelectInferMeta(const MetaTensor& x, const MetaTensor& mask, MetaTensor* out); @@ -360,4 +381,24 @@ void ValueCompareInferMeta(const MetaTensor& x, void SolveInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void UnpoolInferMeta(const MetaTensor& x, + const MetaTensor& indices, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void Unpool3dInferMeta(const MetaTensor& x, + const MetaTensor& indices, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + MetaTensor* out, + MetaConfig config = MetaConfig()); + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 70177c05f0bc2..6e4f2dce35f96 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1090,6 +1090,24 @@ void EditDistanceInferMeta(const MetaTensor& hyps, sequencenum->set_dtype(DataType::FLOAT32); } +void GenerateProposalsV2InferMeta(const MetaTensor& scores, + const MetaTensor& bbox_deltas, + const MetaTensor& im_shape, + const MetaTensor& anchors, + const MetaTensor& variances, + int pre_nms_top_n, + int post_nms_top_n, + float nms_thresh, + float min_size, + float eta, + bool pixel_offset, + MetaTensor* rpn_rois, + MetaTensor* rpn_roi_probs, + MetaTensor* rpn_rois_num) { + rpn_rois->set_dims(phi::make_ddim({-1, 4})); + rpn_roi_probs->set_dims(phi::make_ddim({-1, 1})); +} + void HierarchicalSigmoidInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& label, @@ -1624,6 +1642,105 @@ void InterpolateInferMeta( } } +void LambInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment1, + const MetaTensor& moment2, + const MetaTensor& beta1_pow, + const MetaTensor& beta2_pow, + const MetaTensor& master_param, + const MetaTensor& skip_update, + float weight_decay, + float beta1, + float beta2, + float epsilon, + bool multi_precision, + MetaTensor* param_out, + MetaTensor* moment1_out, + MetaTensor* moment2_out, + MetaTensor* beta1_pow_out, + MetaTensor* beta2_pow_out, + MetaTensor* master_param_outs) { + auto lr_dims = learning_rate.dims(); + PADDLE_ENFORCE_NE( + phi::product(lr_dims), + 0, + phi::errors::InvalidArgument( + "The number of LearningRate shall not be 0, but received %d. Maybe " + "the Input variable LearningRate has not " + "been initialized. 
You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.", + phi::product(lr_dims))); + PADDLE_ENFORCE_EQ( + phi::product(lr_dims), + 1, + phi::errors::InvalidArgument( + "Learning rate should have 1 dimension, but received %d.", + phi::product(lr_dims))); + auto beta1_pow_dims = beta1_pow.dims(); + PADDLE_ENFORCE_GE(phi::product(beta1_pow_dims), + 1, + phi::errors::InvalidArgument( + "The size of Beta1 power accumulator should be " + "greater than 0, but received %d.", + phi::product(beta1_pow_dims))); + auto beta2_pow_dims = beta2_pow.dims(); + PADDLE_ENFORCE_GE(phi::product(beta2_pow_dims), + 1, + phi::errors::InvalidArgument( + "The size of Beta2 power accumulator should be " + "greater than 0, but received %d.", + phi::product(beta2_pow_dims))); + + auto param_dims = param.dims(); + PADDLE_ENFORCE_EQ( + param_dims, + moment1.dims(), + phi::errors::InvalidArgument( + "Param and Moment1 input of LambOp should have same dimension. But " + "received Param dims: [%s], Moment1 dims: [%s].", + param_dims, + moment1.dims())); + PADDLE_ENFORCE_EQ( + param_dims, + moment2.dims(), + errors::InvalidArgument( + "Param and Moment2 input of AdamOp should have same dimension. But " + "received Param dims: [%s], Moment2 dims: [%s].", + param_dims, + moment2.dims())); + + PADDLE_ENFORCE_NOT_NULL( + param_out, errors::NotFound("The output param_out can not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + moment1_out, + errors::NotFound("The output moment1_out can not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + moment2_out, + errors::NotFound("The output moment2_out can not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + beta1_pow_out, + errors::NotFound("The output beta1_pow_out can not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + beta2_pow_out, + errors::NotFound("The output beta2_pow_out can not be nullptr")); + + param_out->set_dims(param_dims); + param_out->set_dtype(param.dtype()); + + moment1_out->set_dims(param_dims); + moment1_out->set_dtype(moment1.dtype()); + moment2_out->set_dims(param_dims); + moment2_out->set_dtype(moment2.dtype()); + + beta1_pow_out->set_dims(beta1_pow_dims); + beta1_pow_out->set_dtype(beta1_pow.dtype()); + beta2_pow_out->set_dims(beta2_pow_dims); + beta2_pow_out->set_dtype(beta2_pow.dtype()); +} + void LogspaceInferMeta(const MetaTensor& start, const MetaTensor& stop, const MetaTensor& number, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index af9fea2d3ce87..472d665050bde 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -221,6 +221,21 @@ void EditDistanceInferMeta(const MetaTensor& hyps, MetaTensor* sequencenum, MetaTensor* out); +void GenerateProposalsV2InferMeta(const MetaTensor& scores, + const MetaTensor& bbox_deltas, + const MetaTensor& im_shape, + const MetaTensor& anchors, + const MetaTensor& variances, + int pre_nms_top_n, + int post_nms_top_n, + float nms_thresh, + float min_size, + float eta, + bool pixel_offset, + MetaTensor* rpn_rois, + MetaTensor* rpn_roi_probs, + MetaTensor* rpn_rois_num); + void HierarchicalSigmoidInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& label, @@ -254,6 +269,27 @@ void InterpolateInferMeta( MetaTensor* output, MetaConfig config = MetaConfig()); +void LambInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment1, + const MetaTensor& moment2, + const MetaTensor& beta1_pow, + const MetaTensor& beta2_pow, + const MetaTensor& master_param, + 
const MetaTensor& skip_update, + float weight_decay, + float beta1, + float beta2, + float epsilon, + bool multi_precision, + MetaTensor* param_out, + MetaTensor* moment1_out, + MetaTensor* moment2_out, + MetaTensor* beta1_pow_out, + MetaTensor* beta2_pow_out, + MetaTensor* master_param_outs); + void LogspaceInferMeta(const MetaTensor& start, const MetaTensor& stop, const MetaTensor& number, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 3ee42b86d6e3e..a919a955a541a 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -412,7 +412,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, const std::string& pool_type, - int64_t out_size, + const IntArray& out_size, MetaTensor* out, MetaTensor* dst_count) { auto src_index_dims = src_index.dims(); @@ -455,23 +455,13 @@ void GraphSendRecvInferMeta(const MetaTensor& x, "Src_index and Dst_index should have the same shape.")); auto dims = x.dims(); - if (out_size <= 0) { - out->set_dims(dims); - } else { - std::vector dims_ = phi::vectorize(dims); - if (dims_.size() > 0) { - dims_[0] = out_size; - } - out->set_dims(phi::make_ddim(dims_)); - } + std::vector dims_ = phi::vectorize(dims); + dims_[0] = -1; + out->set_dims(phi::make_ddim(dims_)); out->set_dtype(x.dtype()); if (pool_type == "MEAN") { - if (out_size <= 0) { - dst_count->set_dims({dims[0]}); - } else { - dst_count->set_dims({out_size}); - } + dst_count->set_dims({-1}); dst_count->set_dtype(DataType::INT32); } } diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 55a63b1c957c4..466bd3df5de2d 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/meta_tensor.h" namespace phi { @@ -75,7 +76,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, const std::string& pool_type, - int64_t out_size, + const IntArray& out_size, MetaTensor* out, MetaTensor* dst_count); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 74705c3759da3..7da162cd0b046 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -309,6 +309,35 @@ void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) { out->set_dtype(x.dtype()); } +void ClassCenterSampleInferMeta(const MetaTensor& label, + int num_classes, + int num_samples, + int ring_id, + int rank, + int nranks, + bool fix_seed, + int seed, + MetaTensor* remapped_label, + MetaTensor* sampled_local_class_center) { + PADDLE_ENFORCE_EQ( + label.dims().size(), + 1, + errors::InvalidArgument("Rank of Input(Label) should be equal to 1, " + "but the value given is %d.", + label.dims().size())); + PADDLE_ENFORCE_NOT_NULL(remapped_label, + phi::errors::InvalidArgument( + "output of remapped label should not be null.")); + PADDLE_ENFORCE_NOT_NULL( + sampled_local_class_center, + phi::errors::InvalidArgument( + "output of sampled local class center should not be null.")); + remapped_label->set_dims(label.dims()); + remapped_label->set_dtype(label.dtype()); + sampled_local_class_center->set_dims(phi::make_ddim({num_samples})); + sampled_local_class_center->set_dtype(label.dtype()); +} + void ClipByNormInferMeta(const MetaTensor& x, float max_norm, MetaTensor* out) { PADDLE_ENFORCE_GT( max_norm, @@ -866,6 +895,112 @@ void FillDiagonalInferMeta( out->set_dtype(x.dtype()); } +void FFTC2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + out, + phi::errors::InvalidArgument("Output of fft_c2c should not be null.")); + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + const phi::DDim x_dim = x.dims(); + for (size_t i = 0; i < axes.size(); i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + out->share_meta(x); +} + +void FFTC2RInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + out, + phi::errors::InvalidArgument("Output of fft_c2r should not be null.")); + const phi::DDim x_dim = x.dims(); + const int64_t last_fft_axis = axes.back(); + + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + size_t signal_dims = axes.size(); + for (size_t i = 0; i < signal_dims - 1; i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + + out->set_layout(x.layout()); + out->set_dtype(ToRealType(x.dtype())); + phi::DDim out_dim = x_dim; + + if (last_dim_size > 0) { + out_dim.at(last_fft_axis) = last_dim_size; + } else if (config.is_runtime) { + const int64_t input_last_dim_size = x_dim[last_fft_axis]; + const int64_t fft_n_point = (input_last_dim_size - 1) * 2; + PADDLE_ENFORCE_GT( + 
fft_n_point, + 0, + phi::errors::InvalidArgument("Invalid fft n-point (%d).", fft_n_point)); + out_dim.at(last_fft_axis) = fft_n_point; + } else { + const int64_t input_last_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = + input_last_dim_size == -1 ? -1 : (input_last_dim_size - 1) * 2; + } + out->set_dims(out_dim); +} + +void FFTR2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + out, + phi::errors::InvalidArgument("Output of fft_r2c should not be null.")); + const phi::DDim x_dim = x.dims(); + + // only ensure that fft axes' size greater than zero at runtime + // they might be -1 to indicate unknown size ar compile time + if (config.is_runtime) { + for (size_t i = 0; i < axes.size(); i++) { + PADDLE_ENFORCE_GT(x_dim[axes[i]], + 0, + phi::errors::InvalidArgument( + "Invalid fft n-point (%d).", x_dim[axes[i]])); + } + } + + out->set_layout(x.layout()); + out->set_dtype(ToComplexType(x.dtype())); + if (!onesided) { + out->share_dims(x); + } else { + phi::DDim out_dim = x.dims(); + const int64_t last_fft_axis = axes.back(); + const int64_t last_fft_dim_size = x_dim[last_fft_axis]; + out_dim.at(last_fft_axis) = last_fft_dim_size / 2 + 1; + out->set_dims(out_dim); + } +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, @@ -2480,6 +2615,9 @@ void ReduceInferMeta(const MetaTensor& x, bool keep_dim, MetaTensor* out) { bool reduce_all = false; + if (axis.size() == 0) { + reduce_all = true; + } ReduceInferMetaBase(x, axis, keep_dim, reduce_all, out); } @@ -2841,7 +2979,7 @@ void SplitInferMeta(const MetaTensor& x, // step1: get formated sections std::vector sections; // num_or_sections is a number - if (num_or_sections_data.size() == 1) { + if (num_or_sections_data.size() == 1 && num_or_sections_data[0] > 0) { int num = num_or_sections_data.at(0); PADDLE_ENFORCE_EQ(input_axis_dim % num, @@ -3148,6 +3286,9 @@ void SumInferMeta(const MetaTensor& x, bool keep_dim, MetaTensor* out) { bool reduce_all = false; + if (axis.size() == 0) { + reduce_all = true; + } SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); } @@ -4113,6 +4254,211 @@ void IdentityLossInferMeta(const MetaTensor& x, } } +void FoldInferMeta(const MetaTensor& x, + const std::vector& output_sizes, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + MetaTensor* out) { + auto in_dims = x.dims(); + + PADDLE_ENFORCE_EQ( + output_sizes.size(), + 2, + phi::errors::InvalidArgument( + "It is expected output_size equals to 2, but got size %d", + output_sizes.size())); + PADDLE_ENFORCE_EQ( + kernel_sizes.size(), + 2, + phi::errors::InvalidArgument( + "It is expected kernel_size equals to 2, but got size %d", + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( + strides.size(), + 2, + phi::errors::InvalidArgument( + "It is expected strides_size equals to 2, but got size %d", + strides.size())); + PADDLE_ENFORCE_EQ( + paddings.size(), + 4, + phi::errors::InvalidArgument( + "It is expected paddings_size equals to 4, but got size %d", + paddings.size())); + + PADDLE_ENFORCE_EQ( + dilations.size(), + 2, + phi::errors::InvalidArgument( + "It is expected dilations_size equals to 2, but got size %d", + dilations.size())); + + int output_height = output_sizes[0]; + int output_width = output_sizes[1]; + int kernel_height = kernel_sizes[0]; + int kernel_width = kernel_sizes[1]; + int 
dilation_height = dilations[0]; + int dilation_width = dilations[1]; + int stride_height = strides[0]; + int stride_width = strides[1]; + + // check kernel_sizes + PADDLE_ENFORCE_GT(kernel_height, + 0, + phi::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but received kernel_height: %d kernel_width: %d.", + kernel_sizes[0], + kernel_sizes[1])); + PADDLE_ENFORCE_GT(kernel_width, + 0, + phi::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but received kernel_height: %d kernel_width: %d.", + kernel_sizes[0], + kernel_sizes[1])); + // check strides + PADDLE_ENFORCE_GT(stride_height, + 0, + phi::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but received strides_height: %d strides_width: %d.", + strides[0], + strides[1])); + PADDLE_ENFORCE_GT(stride_width, + 0, + phi::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but received strides_height: %d strides_width: %d.", + strides[0], + strides[1])); + // check dilations + PADDLE_ENFORCE_GT(output_height, + 1, + phi::errors::InvalidArgument( + "The `output_height` should be greater than one, " + "but received output_height: %d .", + output_height)); + PADDLE_ENFORCE_GT(output_width, + 1, + phi::errors::InvalidArgument( + "The `output_width` should be greater than one, " + "but received output_width: %d .", + output_width)); + // check output size + PADDLE_ENFORCE_GT( + dilation_height, + 0, + phi::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but received dilations_height: %d dilations_width: %d.", + dilations[0], + dilations[1])); + PADDLE_ENFORCE_GT( + dilation_width, + 0, + phi::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but received dilations_height: %d dilations_width: %d.", + dilations[0], + dilations[1])); + + std::vector out_dims; + // batch_size + out_dims.push_back(in_dims[0]); + // output_plane + int output_channels = in_dims[1] / (kernel_width * kernel_height); + out_dims.push_back(output_channels); + + int blocks_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + int blocks_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + + 1; + + // check output height and width + PADDLE_ENFORCE_GT( + blocks_height, + 0, + phi::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "is (%d, %d), which should be a positive integer.", + in_dims[2], + in_dims[3], + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + output_height, + output_width)); + + PADDLE_ENFORCE_GT( + blocks_width, + 0, + phi::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "is (%d, %d), which should be a positive integer.", + in_dims[2], + in_dims[3], + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + output_height, + output_width)); + + PADDLE_ENFORCE_EQ( + blocks_height * blocks_width, + in_dims[2], + phi::errors::InvalidArgument( + "Given input output_size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "which should be expected size of input's dimension " + "2 to match the calculated number of %d * %d = %d, but got %d", + 
output_height, + output_width, + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + blocks_height, + blocks_width, + blocks_height * blocks_width, + in_dims[2])); + + PADDLE_ENFORCE_EQ( + in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), + 0, + phi::errors::InvalidArgument( + "Expected size of input's dimension 1 to be divisible by the" + "product of kernel_size, but got input.size(1)=%d and " + "kernel_size=( %d" + ", %d).", + in_dims[1], + kernel_sizes[0], + kernel_sizes[1])); + + out_dims.push_back(output_height); + out_dims.push_back(output_width); + if (out != nullptr) { + out->set_dims(phi::make_ddim(out_dims)); + out->set_dtype(x.dtype()); + } +} + } // namespace phi PD_REGISTER_INFER_META_FN(flatten, phi::FlattenInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index bd35855a43129..d81c8ea7a4391 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -67,6 +67,17 @@ void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); +void ClassCenterSampleInferMeta(const MetaTensor& label, + int num_classes, + int num_samples, + int ring_id, + int rank, + int nranks, + bool fix_seed, + int seed, + MetaTensor* remapped_label, + MetaTensor* sampled_local_class_center); + void ClipByNormInferMeta(const MetaTensor& x, float max_norm, MetaTensor* out); void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out); @@ -135,6 +146,29 @@ void ExpandInferMeta(const MetaTensor& x, void FillDiagonalInferMeta( const MetaTensor& x, float value, int offset, bool wrap, MetaTensor* out); +void FFTC2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + MetaTensor* out, + MetaConfig = MetaConfig()); + +void FFTC2RInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + MetaTensor* out, + MetaConfig = MetaConfig()); + +void FFTR2CInferMeta(const MetaTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + MetaTensor* out, + MetaConfig = MetaConfig()); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, @@ -583,4 +617,12 @@ void ChannelShuffleInferMeta(const MetaTensor& x, void IdentityLossInferMeta(const MetaTensor& x, int reduction, MetaTensor* out); +void FoldInferMeta(const MetaTensor& x, + const std::vector& output_sizes, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 7b64658d571cf..ca98e43a50313 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -65,7 +65,8 @@ set(COMMON_KERNEL_DEPS matrix_solve phi_dynload_warpctc sequence_padding - sequence_scale) + sequence_scale + fft) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} @@ -88,17 +89,6 @@ copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h") file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h") file(GLOB kernel_primitive_h "primitive/*.h") -file( - GLOB - kernel_cc - "*.cc" - "cpu/*.cc" - "selected_rows/*.cc" - "selected_rows/cpu/*.cc" - "sparse/*.cc" - "sparse/cpu/*.cc" - "strings/*.cc" - 
"strings/cpu/*.cc") file( GLOB @@ -112,10 +102,34 @@ file( "strings/*.cu" "strings/gpu/*.cu") -# file(GLOB kernel_cudnn "gpudnn/*.cu") -# file(GLOB kernel_kps "kps/*.cu") +if(WITH_MKLDNN) + file( + GLOB + kernel_cc + "*.cc" + "cpu/*.cc" + "selected_rows/*.cc" + "selected_rows/cpu/*.cc" + "sparse/*.cc" + "sparse/cpu/*.cc" + "strings/*.cc" + "strings/cpu/*.cc" + "onednn/*.cc") +else() + file( + GLOB + kernel_cc + "*.cc" + "cpu/*.cc" + "selected_rows/*.cc" + "selected_rows/cpu/*.cc" + "sparse/*.cc" + "sparse/cpu/*.cc" + "strings/*.cc" + "strings/cpu/*.cc") +endif() + file(GLOB kernel_xpu "xpu/*.cc") -file(GLOB kernel_onednn "onednn/*.cc") add_library(phi_cpu ${kernel_cc}) kernel_declare("${kernel_cc}") @@ -155,12 +169,4 @@ if(WITH_XPU) set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_xpu) endif() -if(WITH_MKLDNN) - add_library(phi_onednn ${kernel_onednn}) - kernel_declare(${kernel_onednn}) - set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_backends) - target_link_libraries(phi_onednn ${COMMON_KERNEL_DEPS}) - set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_onednn) -endif() - set_property(GLOBAL PROPERTY PHI_KERNELS ${ADD_PHI_KERNELS}) diff --git a/paddle/phi/kernels/affine_grid_grad_kernel.h b/paddle/phi/kernels/affine_grid_grad_kernel.h index 061b763ed33f0..df2dfca10b0b3 100644 --- a/paddle/phi/kernels/affine_grid_grad_kernel.h +++ b/paddle/phi/kernels/affine_grid_grad_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/common/int_array.h" -#include "paddle/phi/kernels/affine_grid_impl.h" namespace phi { diff --git a/paddle/phi/kernels/affine_grid_kernel.h b/paddle/phi/kernels/affine_grid_kernel.h index febd5be0a3661..57ac0007b1c02 100644 --- a/paddle/phi/kernels/affine_grid_kernel.h +++ b/paddle/phi/kernels/affine_grid_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/common/int_array.h" -#include "paddle/phi/kernels/affine_grid_impl.h" namespace phi { diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h index 0294dc950deb1..41be3e43a303d 100644 --- a/paddle/phi/kernels/assign_kernel.h +++ b/paddle/phi/kernels/assign_kernel.h @@ -18,6 +18,7 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" namespace phi { @@ -26,6 +27,16 @@ void AssignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +template +DenseTensor Assign(const Context& dev_ctx, const DenseTensor& x) { + DenseTensor out; + MetaTensor meta_out(&out); + MetaTensor meta_x(x); + UnchangedInferMeta(meta_x, &meta_out); + AssignKernel(dev_ctx, x, &out); + return out; +} + // In order to be compatible with the `AsDispensable` input in the original // assign op maker, the input parameter here needs to be dispensable, but // this looks weird diff --git a/paddle/phi/kernels/class_center_sample_kernel.h b/paddle/phi/kernels/class_center_sample_kernel.h new file mode 100644 index 0000000000000..61717e250c20f --- /dev/null +++ b/paddle/phi/kernels/class_center_sample_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ClassCenterSampleKernel(const Context& dev_ctx, + const DenseTensor& label, + int num_classes, + int num_samples, + int ring_id, + int rank, + int nranks, + bool fix_seed, + int seed, + DenseTensor* remapped_label, + DenseTensor* sampled_local_class_center); +} // namespace phi diff --git a/paddle/phi/kernels/cpu/affine_grid_grad_kernel.cc b/paddle/phi/kernels/cpu/affine_grid_grad_kernel.cc index 778a0adc9ca93..7bea3ff476b7d 100644 --- a/paddle/phi/kernels/cpu/affine_grid_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/affine_grid_grad_kernel.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/affine_grid_grad_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/affine_grid_utils.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/affine_grid_kernel.cc b/paddle/phi/kernels/cpu/affine_grid_kernel.cc index 6584a8eb263ea..712c2a1927719 100644 --- a/paddle/phi/kernels/cpu/affine_grid_kernel.cc +++ b/paddle/phi/kernels/cpu/affine_grid_kernel.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/phi/kernels/affine_grid_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/affine_grid_utils.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/class_center_sample_kernel.cc b/paddle/phi/kernels/cpu/class_center_sample_kernel.cc new file mode 100644 index 0000000000000..6667ea05ab6ac --- /dev/null +++ b/paddle/phi/kernels/cpu/class_center_sample_kernel.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
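As a rough standalone illustration (not part of this patch) of the sampling contract declared above: keep every unique positive label, draw uniform negatives until num_samples centers exist, then remap each label to its index in the sampled set. The helper name SampleClassCenters and the fixed seed below are hypothetical and use no Paddle APIs.

#include <cstdint>
#include <iostream>
#include <map>
#include <random>
#include <set>
#include <vector>

// Minimal sketch: positives first (ascending), then uniformly drawn negatives
// until num_samples centers exist; labels are remapped to indices into the
// sampled list. Assumes num_samples <= num_classes, as the kernel enforces.
static std::vector<int64_t> SampleClassCenters(const std::vector<int64_t>& labels,
                                               int64_t num_classes,
                                               int64_t num_samples,
                                               std::vector<int64_t>* remapped) {
  std::set<int64_t> unique(labels.begin(), labels.end());
  std::vector<int64_t> sampled(unique.begin(), unique.end());
  std::map<int64_t, int64_t> new_index;
  for (size_t i = 0; i < sampled.size(); ++i) {
    new_index[sampled[i]] = static_cast<int64_t>(i);
  }

  std::mt19937_64 rng(0);  // fixed seed, purely for the demo
  std::uniform_int_distribution<int64_t> dist(0, num_classes - 1);
  while (static_cast<int64_t>(unique.size()) < num_samples) {
    int64_t neg = dist(rng);
    if (unique.insert(neg).second) sampled.push_back(neg);  // unordered negatives
  }

  remapped->clear();
  for (int64_t l : labels) remapped->push_back(new_index[l]);
  return sampled;
}

int main() {
  std::vector<int64_t> labels = {3, 7, 3, 1};
  std::vector<int64_t> remapped;
  std::vector<int64_t> centers =
      SampleClassCenters(labels, /*num_classes=*/10, /*num_samples=*/6, &remapped);
  std::cout << "sampled " << centers.size() << " class centers\n";  // prints 6
  return 0;
}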
+ +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void ClassCenterSampleKernel(const Context& dev_ctx, + const DenseTensor& label, + int num_classes, + int num_samples, + int ring_id, + int rank, + int nranks, + bool fix_seed, + int seed, + DenseTensor* remapped_label, + DenseTensor* sampled_local_class_center) { + PADDLE_ENFORCE_GT(num_classes, + 0, + errors::InvalidArgument( + "The value 'num_classes' for Op(class_center_sample) " + "must be greater than 0, " + "but the value given is %d.", + num_classes)); + + PADDLE_ENFORCE_GT(num_samples, + 0, + errors::InvalidArgument( + "The value 'num_samples' for Op(class_center_sample) " + "must be greater than 0, " + "but the value given is %d.", + num_samples)); + + PADDLE_ENFORCE_LE(num_samples, + num_classes, + errors::InvalidArgument( + "The value 'num_samples' for Op(class_center_sample) " + "must be less than or equal to %d, " + "but the value given is %d.", + num_classes, + num_samples)); + + int64_t numel = label.numel(); + auto* label_ptr = label.data(); + + // get unique positive class center by ascending + std::set> unique_label; + for (int64_t i = 0; i < numel; ++i) { + unique_label.insert(label_ptr[i]); + } + + // constrcut a lookup table and get sampled_local_class_center + std::vector actual_sampled; + std::map new_class_dict; + T idx = 0; + for (auto& t : unique_label) { + new_class_dict[t] = idx; + actual_sampled.push_back(t); + idx++; + } + + if (!fix_seed) { + std::random_device rnd; + seed = rnd(); + } + std::uniform_int_distribution dist(0, num_classes - 1); + auto engine = paddle::framework::GetCPURandomEngine(seed); + // sample negative class center randomly + while (unique_label.size() < static_cast(num_samples)) { + T neg = dist(*engine); + if (unique_label.find(neg) == unique_label.end()) { + unique_label.insert(neg); + // unorder for negative class center + actual_sampled.push_back(neg); + } + } + + int actual_num_samples = unique_label.size(); + sampled_local_class_center->Resize({actual_num_samples}); + T* sampled_local_class_center_ptr = + dev_ctx.template Alloc(sampled_local_class_center); + + idx = 0; + for (auto& t : actual_sampled) { + sampled_local_class_center_ptr[idx] = t; + idx++; + } + + // remap the input label to sampled class + auto* remmaped_label_ptr = dev_ctx.template Alloc(remapped_label); + for (int64_t i = 0; i < numel; ++i) { + remmaped_label_ptr[i] = new_class_dict[label_ptr[i]]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(class_center_sample, + CPU, + ALL_LAYOUT, + phi::ClassCenterSampleKernel, + int64_t, + int) {} diff --git a/paddle/phi/kernels/cpu/compare_kernel.cc b/paddle/phi/kernels/cpu/compare_kernel.cc index 694b44c16d80e..ae6c3fd5cb020 100644 --- a/paddle/phi/kernels/cpu/compare_kernel.cc +++ b/paddle/phi/kernels/cpu/compare_kernel.cc @@ -80,7 +80,8 @@ PD_REGISTER_KERNEL(less_than, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(less_equal, CPU, ALL_LAYOUT, @@ -90,7 +91,8 @@ PD_REGISTER_KERNEL(less_equal, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(greater_than, CPU, ALL_LAYOUT, @@ -100,7 +102,8 @@ PD_REGISTER_KERNEL(greater_than, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(greater_equal, CPU, ALL_LAYOUT, @@ -110,7 +113,8 @@ PD_REGISTER_KERNEL(greater_equal, int, int64_t, float, - double) {} + 
double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(equal, CPU, ALL_LAYOUT, @@ -120,7 +124,8 @@ PD_REGISTER_KERNEL(equal, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(not_equal, CPU, ALL_LAYOUT, @@ -130,7 +135,8 @@ PD_REGISTER_KERNEL(not_equal, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(equal_all, CPU, diff --git a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc index 42b2834aaffc9..445e92716a899 100644 --- a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc @@ -24,7 +24,7 @@ template void DropoutNdGradKernel(const Context& dev_ctx, const DenseTensor& mask, const DenseTensor& out_grad, - float p, + const Scalar& p, bool is_test, const std::string& mode, const std::vector& axis, @@ -35,6 +35,7 @@ void DropoutNdGradKernel(const Context& dev_ctx, auto dX = EigenVector::Flatten(*grad_x); auto dY = EigenVector::Flatten(*grad_y); + float prob = p.to(); auto& place = *dev_ctx.eigen_device(); auto& dropout_implementation = mode; @@ -42,20 +43,20 @@ void DropoutNdGradKernel(const Context& dev_ctx, if (dropout_implementation == "upscale_in_train") { dX.device(place) = static_cast(1) * dY; } else { - dX.device(place) = dY * static_cast(1.0f - p); + dX.device(place) = dY * static_cast(1.0f - prob); } } else { std::vector out_dims = phi::vectorize(out_grad.dims()); auto M = EigenVector::Flatten(mask); if (dropout_implementation == "upscale_in_train") { - if (p == 1.0f) { + if (prob == 1.0f) { dX.device(place) = static_cast(0) * dY; } else { if (axis.empty()) { - dX.device(place) = dY * M.cast() / static_cast(1.0f - p); + dX.device(place) = dY * M.cast() / static_cast(1.0f - prob); } else { - dX.device(place) = - dY * M.broadcast(out_dims).cast() / static_cast(1.0f - p); + dX.device(place) = dY * M.broadcast(out_dims).cast() / + static_cast(1.0f - prob); } } } else { @@ -72,12 +73,12 @@ template void DropoutGradRawKernel(const Context& dev_ctx, const DenseTensor& mask, const DenseTensor& out_grad, - float p, + const Scalar& p, bool is_test, const std::string& mode, DenseTensor* x_grad) { DropoutNdGradKernel( - dev_ctx, mask, out_grad, p, is_test, mode, {}, x_grad); + dev_ctx, mask, out_grad, p.to(), is_test, mode, {}, x_grad); } } // namespace phi diff --git a/paddle/phi/kernels/cpu/dropout_kernel.cc b/paddle/phi/kernels/cpu/dropout_kernel.cc index d3ca21cfe33b9..41c33fcf5dd3f 100644 --- a/paddle/phi/kernels/cpu/dropout_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_kernel.cc @@ -25,7 +25,7 @@ namespace phi { template void ComputeDropoutInference(const Context& ctx, const DenseTensor& x, - float dropout_prob, + const Scalar& dropout_prob, bool upscale_in_train, DenseTensor* y) { if (upscale_in_train) { @@ -41,7 +41,7 @@ void ComputeDropoutInference(const Context& ctx, auto X = EigenMatrix::Reshape(x, 1); auto Y = EigenMatrix::Reshape(*y, 1); auto& place = *ctx.eigen_device(); - Y.device(place) = X * static_cast(1.0f - dropout_prob); + Y.device(place) = X * static_cast(1.0f - dropout_prob.to()); } } @@ -49,7 +49,7 @@ template void DropoutRawKernel(const Context& dev_ctx, const DenseTensor& x, const paddle::optional& seed_tensor, - float p, + const Scalar& p, bool is_test, const std::string& mode, int seed, @@ -59,7 +59,7 @@ void DropoutRawKernel(const Context& dev_ctx, auto* y = out; const auto* x_data = x.data(); T* y_data = dev_ctx.template Alloc(y); - float dropout_prob = p; + float dropout_prob = p.to(); auto& 
dropout_implementation = mode; bool upscale_in_train = (dropout_implementation == "upscale_in_train"); @@ -109,7 +109,7 @@ template void DropoutNdKernel(const Context& dev_ctx, const DenseTensor& x, const paddle::optional& seed_tensor, - float p, + const Scalar& p, bool is_test, const std::string& mode, int seed, @@ -120,7 +120,7 @@ void DropoutNdKernel(const Context& dev_ctx, auto* y = out; const auto* x_data = x.data(); T* y_data = dev_ctx.template Alloc(y); - float dropout_prob = p; + float dropout_prob = p.to(); auto& dropout_implementation = mode; bool upscale_in_train = (dropout_implementation == "upscale_in_train"); diff --git a/paddle/phi/kernels/cpu/fft_grad_kernel.cc b/paddle/phi/kernels/cpu/fft_grad_kernel.cc new file mode 100644 index 0000000000000..aecaf6c5c13f8 --- /dev/null +++ b/paddle/phi/kernels/cpu/fft_grad_kernel.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fft_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c_grad, + CPU, + ALL_LAYOUT, + phi::FFTC2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL( + fft_c2r_grad, CPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) {} +PD_REGISTER_KERNEL(fft_r2c_grad, + CPU, + ALL_LAYOUT, + phi::FFTR2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/fft_kernel.cc b/paddle/phi/kernels/cpu/fft_kernel.cc new file mode 100644 index 0000000000000..f5e3e350fc085 --- /dev/null +++ b/paddle/phi/kernels/cpu/fft_kernel.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
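Before the FFT kernel registrations below, a brief standalone sketch (not patch code) of the onesided length bookkeeping the FFT meta functions above rely on: an R2C transform of n real samples yields n/2 + 1 complex bins, and a C2R transform defaults to (m - 1) * 2 output samples when last_dim_size is not given.

#include <cassert>
#include <cstdint>
#include <iostream>

static int64_t OnesidedLength(int64_t n) { return n / 2 + 1; }       // R2C output bins
static int64_t DefaultC2RLength(int64_t m) { return (m - 1) * 2; }   // C2R default output

int main() {
  const int64_t lengths[] = {8, 9, 16};
  for (int64_t n : lengths) {
    int64_t m = OnesidedLength(n);
    std::cout << n << " real samples -> " << m << " complex bins\n";
    // Odd lengths cannot be recovered from m alone, which is why the C2R meta
    // functions also accept an explicit last_dim_size attribute.
    if (n % 2 == 0) assert(DefaultC2RLength(m) == n);
  }
  return 0;
}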
+ +#include "paddle/phi/kernels/fft_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c, + CPU, + ALL_LAYOUT, + phi::FFTC2CKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_c2r, + CPU, + ALL_LAYOUT, + phi::FFTC2RKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_r2c, CPU, ALL_LAYOUT, phi::FFTR2CKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/fill_diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/fill_diagonal_grad_kernel.cc index 1291a677bf9de..0fe9c50dc15e8 100644 --- a/paddle/phi/kernels/cpu/fill_diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/fill_diagonal_grad_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/fill_diagonal_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { @@ -30,7 +31,7 @@ void FillDiagonalGradKernel(const Context& ctx, phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad); auto dx_dims = x_grad->dims(); - auto strides = CalStride(dx_dims); + auto strides = funcs::CalStride(dx_dims); auto size = x_grad->numel(); auto wrapsize = std::min(size, dx_dims[1] * dx_dims[1]); diff --git a/paddle/phi/kernels/cpu/fill_diagonal_kernel.cc b/paddle/phi/kernels/cpu/fill_diagonal_kernel.cc index 232f2444cf409..c5888f5d30ed2 100644 --- a/paddle/phi/kernels/cpu/fill_diagonal_kernel.cc +++ b/paddle/phi/kernels/cpu/fill_diagonal_kernel.cc @@ -16,6 +16,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { @@ -32,7 +33,7 @@ void FillDiagonalKernel(const Context& ctx, phi::Copy(ctx, x, ctx.GetPlace(), false, out); auto out_dims = out->dims(); - auto strides = CalStride(out_dims); + auto strides = funcs::CalStride(out_dims); auto size = out->numel(); // The wrap mode supported only the dims equels to 2; In wrap mode, the diff --git a/paddle/phi/kernels/cpu/fill_diagonal_tensor_grad_kernel.cc b/paddle/phi/kernels/cpu/fill_diagonal_tensor_grad_kernel.cc new file mode 100644 index 0000000000000..318e2016097fe --- /dev/null +++ b/paddle/phi/kernels/cpu/fill_diagonal_tensor_grad_kernel.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/fill_diagonal_tensor_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void FillDiagonalTensorGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int64_t offset, + int dim1, + int dim2, + DenseTensor* x_grad) { + auto matrows = 1; + + if (x_grad) { + auto* data = ctx.template Alloc(x_grad); + + auto dx_dims = x_grad->dims(); + for (int i = 0; i < dx_dims.size(); i++) { + if (i != dim1 && i != dim2) { + matrows *= dx_dims[i]; + } + } + + int64_t new_dims[2], strides[2]; + std::vector matdim; + matdim.resize(matrows); + CalMatDims(dx_dims, dim1, dim2, &offset, new_dims, strides, matdim.data()); + + auto size = x_grad->numel(); + phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad); + + for (int64_t i = 0; i < new_dims[0]; i += 1) { + auto sumoff = matdim[i] + offset; + for (int64_t j = 0; j < new_dims[1]; j += 1) { + auto fill_index = j * (strides[1] + strides[0]) + sumoff; + if (fill_index < size) { + data[fill_index] = 0; + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(fill_diagonal_tensor_grad, + CPU, + ALL_LAYOUT, + phi::FillDiagonalTensorGradKernel, + float, + double, + int64_t, + int, + int8_t, + uint8_t, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex, + bool) {} diff --git a/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc b/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc new file mode 100644 index 0000000000000..4e8030199d16a --- /dev/null +++ b/paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/fill_diagonal_tensor_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" + +namespace phi { + +void CalMatDims(phi::DDim out_dims, + int dim1, + int dim2, + int64_t *offset, + int64_t *new_dims, + int64_t *strides, + int64_t *matoffset) { + int64_t dimprod = 1, batchdim = 1; + int rank = out_dims.size(); + int matoffidx = 0; + for (int i = rank - 1; i >= 0; i--) { + if (i == dim2) { + strides[0] = dimprod; + } else if (i == dim1) { + strides[1] = dimprod; + } else { + batchdim *= out_dims[i]; + // matoffset calculate the offset position of the diagonal defined by dim1 + // and dim2 + // the first circle calculate the final free dimension + // and then calculate the front free dim one by one + if (matoffidx == 0) { + for (int64_t j = 0; j < out_dims[i]; j++) { + matoffset[matoffidx] = dimprod * j; + matoffidx++; + } + } else { + auto size = matoffidx; + for (int64_t j = 1; j < out_dims[i]; j++) { + for (int64_t k = 0; k < size; k++) { + matoffset[matoffidx] = matoffset[k] + dimprod * j; + matoffidx++; + } + } + } + } + dimprod *= out_dims[i]; + } + + auto diagdim = dim1; + if (*offset >= 0) { + diagdim = std::min(out_dims[dim1], out_dims[dim2] - *offset); + *offset *= strides[0]; + } else { + diagdim = std::min(out_dims[dim1] + *offset, out_dims[dim2]); + *offset *= -strides[1]; + } + new_dims[0] = batchdim; + new_dims[1] = diagdim; + return; +} + +template +void FillDiagonalTensorKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &y, + int64_t offset, + int dim1, + int dim2, + DenseTensor *out) { + T *out_data = ctx.template Alloc(out); + const T *fill_data = y.data(); + + phi::Copy(ctx, x, ctx.GetPlace(), false, out); + auto out_dims = out->dims(); + auto matdims = y.dims(); + auto fill_dims = phi::flatten_to_2d(matdims, matdims.size() - 1); + + int64_t new_dims[2], strides[2]; + std::vector matdim; + matdim.resize(fill_dims[0]); + CalMatDims(out_dims, dim1, dim2, &offset, new_dims, strides, matdim.data()); + PADDLE_ENFORCE_EQ( + new_dims[0], + fill_dims[0], + errors::InvalidArgument("The dims should be %d x %d, but get " + "%d x %d in fill tensor Y", + new_dims[0], + new_dims[1], + fill_dims[0], + fill_dims[1])); + PADDLE_ENFORCE_EQ( + new_dims[1], + fill_dims[1], + errors::InvalidArgument("The dims should be %d x %d, but get " + "%d x %d in fill tensor Y", + new_dims[0], + new_dims[1], + fill_dims[0], + fill_dims[1])); + + auto size = out->numel(); + for (int64_t i = 0; i < fill_dims[0]; i += 1) { + auto sumoff = matdim[i] + offset; + for (int64_t j = 0; j < fill_dims[1]; j += 1) { + auto fill_index = j * (strides[1] + strides[0]) + sumoff; + if (fill_index < size) { + out_data[fill_index] = fill_data[i * fill_dims[1] + j]; + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(fill_diagonal_tensor, + CPU, + ALL_LAYOUT, + phi::FillDiagonalTensorKernel, + float, + double, + int64_t, + int, + int8_t, + uint8_t, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex, + bool) {} diff --git a/paddle/phi/kernels/cpu/fill_grad_kernel.cc b/paddle/phi/kernels/cpu/fill_grad_kernel.cc new file mode 100644 index 0000000000000..ee676773762ca --- /dev/null +++ b/paddle/phi/kernels/cpu/fill_grad_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/fill_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(fill_grad, + CPU, + ALL_LAYOUT, + phi::FillGradKernel, + float, + double, + int64_t, + int, + paddle::platform::float16, + bool) {} diff --git a/paddle/phi/kernels/cpu/fill_kernel.cc b/paddle/phi/kernels/cpu/fill_kernel.cc new file mode 100644 index 0000000000000..ee8dac7f6770c --- /dev/null +++ b/paddle/phi/kernels/cpu/fill_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/fill_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(fill, + CPU, + ALL_LAYOUT, + phi::FillKernel, + float, + double, + int64_t, + int, + paddle::platform::float16, + bool) {} diff --git a/paddle/phi/kernels/cpu/fold_grad_kernel.cc b/paddle/phi/kernels/cpu/fold_grad_kernel.cc new file mode 100644 index 0000000000000..0c3f1dda03e5e --- /dev/null +++ b/paddle/phi/kernels/cpu/fold_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fold_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fold_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + fold_grad, CPU, ALL_LAYOUT, phi::FoldGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/fold_kernel.cc b/paddle/phi/kernels/cpu/fold_kernel.cc new file mode 100644 index 0000000000000..e22ac4c771ed9 --- /dev/null +++ b/paddle/phi/kernels/cpu/fold_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fold_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fold_kernel_impl.h" + +PD_REGISTER_KERNEL(fold, CPU, ALL_LAYOUT, phi::FoldKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/generate_proposals_v2_kernel.cc b/paddle/phi/kernels/cpu/generate_proposals_v2_kernel.cc new file mode 100644 index 0000000000000..22f39555449a1 --- /dev/null +++ b/paddle/phi/kernels/cpu/generate_proposals_v2_kernel.cc @@ -0,0 +1,392 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/generate_proposals_v2_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/detection/nms_util.h" +#include "paddle/phi/kernels/funcs/gather.h" + +namespace phi { + +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +static void AppendProposals(DenseTensor* dst, + int64_t offset, + const DenseTensor& src) { + auto* out_data = dst->data(); + auto* to_add_data = src.data(); + size_t size_of_t = SizeOf(src.dtype()); + offset *= size_of_t; + std::memcpy( + reinterpret_cast(reinterpret_cast(out_data) + offset), + to_add_data, + src.numel() * size_of_t); +} + +template +void ClipTiledBoxes(const phi::CPUContext& ctx, + const DenseTensor& im_info, + const DenseTensor& input_boxes, + DenseTensor* out, + bool is_scale = true, + bool pixel_offset = true) { + T* out_data = ctx.template Alloc(out); + const T* im_info_data = im_info.data(); + const T* input_boxes_data = input_boxes.data(); + T offset = pixel_offset ? static_cast(1.0) : 0; + T zero(0); + T im_w = + is_scale ? round(im_info_data[1] / im_info_data[2]) : im_info_data[1]; + T im_h = + is_scale ? 
round(im_info_data[0] / im_info_data[2]) : im_info_data[0]; + for (int64_t i = 0; i < input_boxes.numel(); ++i) { + if (i % 4 == 0) { + out_data[i] = + std::max(std::min(input_boxes_data[i], im_w - offset), zero); + } else if (i % 4 == 1) { + out_data[i] = + std::max(std::min(input_boxes_data[i], im_h - offset), zero); + } else if (i % 4 == 2) { + out_data[i] = + std::max(std::min(input_boxes_data[i], im_w - offset), zero); + } else { + out_data[i] = + std::max(std::min(input_boxes_data[i], im_h - offset), zero); + } + } +} + +// Filter the box with small area +template +void FilterBoxes(const phi::CPUContext& ctx, + const DenseTensor* boxes, + float min_size, + const DenseTensor& im_info, + bool is_scale, + DenseTensor* keep, + bool pixel_offset = true) { + const T* im_info_data = im_info.data(); + const T* boxes_data = boxes->data(); + keep->Resize(phi::make_ddim({boxes->dims()[0]})); + min_size = std::max(min_size, 1.0f); + int* keep_data = ctx.template Alloc(keep); + T offset = pixel_offset ? static_cast(1.0) : 0; + + int keep_len = 0; + for (int i = 0; i < boxes->dims()[0]; ++i) { + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + offset; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + offset; + if (pixel_offset) { + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr = boxes_data[4 * i + 1] + hs / 2; + + if (is_scale) { + ws = (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_info_data[2] + 1; + hs = (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_info_data[2] + + 1; + } + if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] && + y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } else { + if (ws >= min_size && hs >= min_size) { + keep_data[keep_len++] = i; + } + } + } + keep->Resize(phi::make_ddim({keep_len})); +} + +template +static void BoxCoder(const phi::CPUContext& ctx, + DenseTensor* all_anchors, + DenseTensor* bbox_deltas, + DenseTensor* variances, + DenseTensor* proposals, + const bool pixel_offset = true) { + T* proposals_data = ctx.template Alloc(proposals); + + int64_t row = all_anchors->dims()[0]; + int64_t len = all_anchors->dims()[1]; + + auto* bbox_deltas_data = bbox_deltas->data(); + auto* anchor_data = all_anchors->data(); + const T* variances_data = nullptr; + if (variances) { + variances_data = variances->data(); + } + + T offset = pixel_offset ? 
static_cast(1.0) : 0; + for (int64_t i = 0; i < row; ++i) { + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + offset; + T anchor_height = + anchor_data[i * len + 3] - anchor_data[i * len + 1] + offset; + + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; + + T bbox_center_x = 0, bbox_center_y = 0; + T bbox_width = 0, bbox_height = 0; + + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } else { + bbox_center_x = + bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } + + proposals_data[i * len] = bbox_center_x - bbox_width / 2; + proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - offset; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - offset; + } + // return proposals; +} + +template +std::pair ProposalForOneImage( + const phi::CPUContext& ctx, + const DenseTensor& im_shape_slice, + const DenseTensor& anchors, + const DenseTensor& variances, + const DenseTensor& bbox_deltas_slice, // [M, 4] + const DenseTensor& scores_slice, // [N, 1] + int pre_nms_top_n, + int post_nms_top_n, + float nms_thresh, + float min_size, + float eta, + bool pixel_offset = true) { + auto* scores_data = scores_slice.data(); + + // Sort index + DenseTensor index_t; + index_t.Resize(phi::make_ddim({scores_slice.numel()})); + int* index = ctx.template Alloc(&index_t); + for (int i = 0; i < scores_slice.numel(); ++i) { + index[i] = i; + } + auto compare = [scores_data](const int64_t& i, const int64_t& j) { + return scores_data[i] > scores_data[j]; + }; + + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { + std::sort(index, index + scores_slice.numel(), compare); + } else { + std::nth_element( + index, index + pre_nms_top_n, index + scores_slice.numel(), compare); + index_t.Resize(phi::make_ddim({pre_nms_top_n})); + } + + DenseTensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.Resize(phi::make_ddim({index_t.numel(), 1})); + ctx.template Alloc(&scores_sel); + + bbox_sel.Resize(phi::make_ddim({index_t.numel(), 4})); + ctx.template Alloc(&bbox_sel); + + anchor_sel.Resize(phi::make_ddim({index_t.numel(), 4})); + ctx.template Alloc(&anchor_sel); + + var_sel.Resize(phi::make_ddim({index_t.numel(), 4})); + ctx.template Alloc(&var_sel); + + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); + + DenseTensor proposals; + proposals.Resize(phi::make_ddim({index_t.numel(), 4})); + ctx.template Alloc(&proposals); + + BoxCoder(ctx, &anchor_sel, &bbox_sel, 
&var_sel, &proposals, pixel_offset); + + ClipTiledBoxes( + ctx, im_shape_slice, proposals, &proposals, false, pixel_offset); + + DenseTensor keep; + FilterBoxes( + ctx, &proposals, min_size, im_shape_slice, false, &keep, pixel_offset); + // Handle the case when there is no keep index left + if (keep.numel() == 0) { + phi::funcs::SetConstant set_zero; + bbox_sel.Resize(phi::make_ddim({1, 4})); + ctx.template Alloc(&bbox_sel); + set_zero(ctx, &bbox_sel, static_cast(0)); + DenseTensor scores_filter; + scores_filter.Resize(phi::make_ddim({1, 1})); + ctx.template Alloc(&scores_filter); + set_zero(ctx, &scores_filter, static_cast(0)); + return std::make_pair(bbox_sel, scores_filter); + } + + DenseTensor scores_filter; + bbox_sel.Resize(phi::make_ddim({keep.numel(), 4})); + ctx.template Alloc(&bbox_sel); + scores_filter.Resize(phi::make_ddim({keep.numel(), 1})); + ctx.template Alloc(&scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); + if (nms_thresh <= 0) { + return std::make_pair(bbox_sel, scores_filter); + } + + DenseTensor keep_nms = phi::funcs::NMS( + ctx, &bbox_sel, &scores_filter, nms_thresh, eta, pixel_offset); + + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize(phi::make_ddim({post_nms_top_n})); + } + + proposals.Resize(phi::make_ddim({keep_nms.numel(), 4})); + ctx.template Alloc(&proposals); + scores_sel.Resize(phi::make_ddim({keep_nms.numel(), 1})); + ctx.template Alloc(&scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + + return std::make_pair(proposals, scores_sel); +} + +template +void GenerateProposalsV2Kernel(const Context& ctx, + const DenseTensor& scores, + const DenseTensor& bbox_deltas, + const DenseTensor& im_shape, + const DenseTensor& anchors, + const DenseTensor& variances, + int pre_nms_top_n, + int post_nms_top_n, + float nms_thresh, + float min_size, + float eta, + bool pixel_offset, + DenseTensor* rpn_rois, + DenseTensor* rpn_roi_probs, + DenseTensor* rpn_rois_num) { + auto& scores_dim = scores.dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto& bbox_dim = bbox_deltas.dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + rpn_rois->Resize(phi::make_ddim({bbox_deltas.numel() / 4, 4})); + ctx.template Alloc(rpn_rois); + + rpn_roi_probs->Resize(phi::make_ddim({scores.numel(), 1})); + ctx.template Alloc(rpn_roi_probs); + + DenseTensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.Resize(phi::make_ddim({num, h_bbox, w_bbox, c_bbox})); + ctx.template Alloc(&bbox_deltas_swap); + + scores_swap.Resize(phi::make_ddim({num, h_score, w_score, c_score})); + ctx.template Alloc(&scores_swap); + + phi::funcs::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(ctx, bbox_deltas, &bbox_deltas_swap, axis); + trans(ctx, scores, &scores_swap, axis); + + phi::LoD lod; + lod.resize(1); + auto& lod0 = lod[0]; + lod0.push_back(0); + DenseTensor tmp_anchors = anchors; + DenseTensor tmp_variances = variances; + tmp_anchors.Resize(phi::make_ddim({tmp_anchors.numel() / 4, 4})); + tmp_variances.Resize(phi::make_ddim({tmp_variances.numel() / 4, 4})); + std::vector tmp_num; + + int64_t num_proposals = 0; + for (int64_t i = 0; i < num; ++i) { + DenseTensor im_shape_slice = im_shape.Slice(i, i + 1); + DenseTensor 
bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + DenseTensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize(phi::make_ddim({h_bbox * w_bbox * c_bbox / 4, 4})); + scores_slice.Resize(phi::make_ddim({h_score * w_score * c_score, 1})); + + std::pair tensor_pair = + ProposalForOneImage(ctx, + im_shape_slice, + tmp_anchors, + tmp_variances, + bbox_deltas_slice, + scores_slice, + pre_nms_top_n, + post_nms_top_n, + nms_thresh, + min_size, + eta, + pixel_offset); + DenseTensor& proposals = tensor_pair.first; + DenseTensor& nscores = tensor_pair.second; + + AppendProposals(rpn_rois, 4 * num_proposals, proposals); + AppendProposals(rpn_roi_probs, num_proposals, nscores); + num_proposals += proposals.dims()[0]; + lod0.push_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); + } + if (rpn_rois_num != nullptr) { + rpn_rois_num->Resize(phi::make_ddim({num})); + ctx.template Alloc(rpn_rois_num); + int* num_data = rpn_rois_num->data(); + for (int i = 0; i < num; i++) { + num_data[i] = tmp_num[i]; + } + rpn_rois_num->Resize(phi::make_ddim({num})); + } + rpn_rois->Resize(phi::make_ddim({num_proposals, 4})); + rpn_roi_probs->Resize(phi::make_ddim({num_proposals, 1})); +} + +} // namespace phi + +PD_REGISTER_KERNEL(generate_proposals_v2, + CPU, + ALL_LAYOUT, + phi::GenerateProposalsV2Kernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc index e4034230c7866..d4b9c8c60e3f8 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc @@ -88,27 +88,35 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, DenseTensor* dst_count = nullptr) { const int& index_size = src_index.dims()[0]; - ctx.template Alloc(out); - T* p_output = out->data(); const auto& src_dims = x.dims(); int64_t memset_size = 1; if (out_size <= 0) { + out->Resize(src_dims); for (int i = 0; i < src_dims.size(); ++i) { memset_size *= src_dims[i]; } } else { + // Set out dim following out_size. + std::vector dims_ = phi::vectorize(src_dims); + if (dims_.size() > 0) { + dims_[0] = out_size; + } + out->Resize(phi::make_ddim(dims_)); memset_size = out_size; for (int i = 1; i < src_dims.size(); ++i) { memset_size *= src_dims[i]; } } + + ctx.template Alloc(out); + T* p_output = out->data(); const size_t& memset_bytes = memset_size * sizeof(T); memset(p_output, 0, memset_bytes); if (index_size == 0) return; - const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index.data(); + if (pool_type == "SUM") { GraphSendRecvCpuLoop>( src_dims[0], index_size, s_index, d_index, x, out, pool_type); @@ -119,10 +127,12 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, GraphSendRecvCpuLoop>( src_dims[0], index_size, s_index, d_index, x, out, pool_type); } else if (pool_type == "MEAN") { + int64_t input_size = out_size <= 0 ? 
src_dims[0] : out_size; + dst_count->Resize({input_size}); ctx.template Alloc(dst_count); int* p_dst_count = dst_count->data(); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - GraphSendRecvCpuLoop>(src_dims[0], + memset(p_dst_count, 0, input_size * sizeof(int)); + GraphSendRecvCpuLoop>(input_size, index_size, s_index, d_index, @@ -139,16 +149,29 @@ void GraphSendRecvKernel(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& pool_type, - int64_t out_size, + const IntArray& out_size, DenseTensor* out, DenseTensor* dst_count) { auto index_type = src_index.dtype(); + auto& out_size_data = out_size.GetData(); if (index_type == phi::DataType::INT32) { - GraphSendRecvOpKernelLaunchHelper( - ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count); + GraphSendRecvOpKernelLaunchHelper(ctx, + x, + src_index, + dst_index, + pool_type, + out_size_data[0], + out, + dst_count); } else if (index_type == phi::DataType::INT64) { - GraphSendRecvOpKernelLaunchHelper( - ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count); + GraphSendRecvOpKernelLaunchHelper(ctx, + x, + src_index, + dst_index, + pool_type, + out_size_data[0], + out, + dst_count); } } diff --git a/paddle/phi/kernels/cpu/lamb_kernel.cc b/paddle/phi/kernels/cpu/lamb_kernel.cc new file mode 100644 index 0000000000000..1394f8e5b910c --- /dev/null +++ b/paddle/phi/kernels/cpu/lamb_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lamb_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lamb_kernel_impl.h" + +PD_REGISTER_KERNEL(lamb, CPU, ALL_LAYOUT, phi::LambKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/margin_cross_entropy_kernel.cc b/paddle/phi/kernels/cpu/margin_cross_entropy_kernel.cc new file mode 100644 index 0000000000000..06d74471dd992 --- /dev/null +++ b/paddle/phi/kernels/cpu/margin_cross_entropy_kernel.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
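
The graph_send_recv change above makes the output shape depend on out_size: when out_size <= 0 the output keeps the source dims, otherwise only the leading dimension is overwritten before allocation. A minimal standalone sketch of that shape rule follows; the helper name InferSendRecvOutDims is illustrative only and not part of this patch.

#include <cstdint>
#include <vector>

// Sketch: reproduce the out-dims rule used in GraphSendRecvOpKernelLaunchHelper.
// Dim 0 is replaced by out_size when a positive out_size is given; all other
// dims are taken from the input unchanged.
std::vector<int64_t> InferSendRecvOutDims(const std::vector<int64_t>& src_dims,
                                          int64_t out_size) {
  std::vector<int64_t> out_dims = src_dims;
  if (out_size > 0 && !out_dims.empty()) {
    out_dims[0] = out_size;  // only the leading (destination-node) dimension changes
  }
  return out_dims;
}
// e.g. src_dims = {10, 16}, out_size = 4  ->  out_dims = {4, 16};
// the MEAN branch sizes dst_count with the same leading dimension.
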
+ +#include "paddle/phi/kernels/margin_cross_entropy_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MarginCrossEntropyKernel(const Context& dev_ctx, + const DenseTensor& logits, + const DenseTensor& labels, + bool return_softmax, + int ring_id, + int rank, + int nranks, + float margin1, + float margin2, + float margin3, + float scale, + DenseTensor* softmax, + DenseTensor* loss) { + PADDLE_THROW( + errors::Unavailable("Do not support margin_cross_entropy for cpu kernel " + "now.")); +} + +} // namespace phi + +PD_REGISTER_KERNEL(margin_cross_entropy, + CPU, + ALL_LAYOUT, + phi::MarginCrossEntropyKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc b/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc index ff6e2a372791e..dc82ffbea8791 100644 --- a/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc +++ b/paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc @@ -252,7 +252,6 @@ void SliceOneClass(const Context& ctx, const DenseTensor& items, const int class_id, DenseTensor* one_class_item) { - // T* item_data = one_class_item->mutable_data(ctx.GetPlace()); T* item_data = ctx.template Alloc(one_class_item); const T* items_data = items.data(); const int64_t num_item = items.dims()[0]; diff --git a/paddle/phi/kernels/cpu/scale_kernel.cc b/paddle/phi/kernels/cpu/scale_kernel.cc index e929b5bd7219b..421aae270ee59 100644 --- a/paddle/phi/kernels/cpu/scale_kernel.cc +++ b/paddle/phi/kernels/cpu/scale_kernel.cc @@ -62,4 +62,6 @@ PD_REGISTER_KERNEL(scale, int8_t, int16_t, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/unpool_grad_kernel.cc b/paddle/phi/kernels/cpu/unpool_grad_kernel.cc new file mode 100644 index 0000000000000..e09082f7ba80f --- /dev/null +++ b/paddle/phi/kernels/cpu/unpool_grad_kernel.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
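
For one (batch, channel) plane, the unpool backward kernels that follow simply gather output gradients at the recorded max indices. A reduced per-plane sketch of that gather is shown below; UnpoolGradPlane is a hypothetical helper written only to restate the loop, not code from this patch.

#include <cassert>
#include <vector>

// Sketch: per-plane unpool backward. indices[i] records where input element i
// was scattered during max-unpooling; its gradient is read back from that slot.
std::vector<float> UnpoolGradPlane(const std::vector<float>& out_grad,
                                   const std::vector<int>& indices) {
  std::vector<float> in_grad(indices.size(), 0.f);
  for (size_t i = 0; i < indices.size(); ++i) {
    assert(indices[i] >= 0 && indices[i] < static_cast<int>(out_grad.size()));
    in_grad[i] = out_grad[indices[i]];
  }
  return in_grad;
}
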
+ +#include "paddle/phi/kernels/unpool_grad_kernel.h" + +#include +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void UnpoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + DenseTensor* x_grad) { + T* input_grad_data = dev_ctx.template Alloc(x_grad); + const T* output_grad_data = out_grad.data(); + phi::funcs::SetConstant zero; + zero(dev_ctx, x_grad, static_cast(0)); + const int batch_size = x.dims()[0]; + const int input_height = x.dims()[2]; + const int input_width = x.dims()[3]; + const int output_channels = out.dims()[1]; + const int output_height = out.dims()[2]; + const int output_width = out.dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const int* indices_data = indices.data(); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE_LT( + index, + output_feasize, + phi::errors::InvalidArgument( + "index should less than output tensor height * output tensor " + "width. Expected %ld < %ld, but got " + "%ld >= %ld. Please check input value.", + index, + output_feasize, + index, + output_feasize)); + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } +} + +template +void Unpool3dGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + DenseTensor* x_grad) { + T* input_grad_data = dev_ctx.template Alloc(x_grad); + const T* output_grad_data = out_grad.data(); + phi::funcs::SetConstant zero; + zero(dev_ctx, x_grad, static_cast(0)); + + const int batch_size = x.dims()[0]; + const int input_depth = x.dims()[2]; + const int input_height = x.dims()[3]; + const int input_width = x.dims()[4]; + const int output_channels = out.dims()[1]; + const int output_depth = out.dims()[2]; + const int output_height = out.dims()[3]; + const int output_width = out.dims()[4]; + int input_feasize = input_depth * input_height * input_width; + int output_feasize = output_depth * output_height * output_width; + const int* indices_data = indices.data(); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE_LT( + index, + output_feasize, + phi::errors::InvalidArgument( + "index should less than output tensor depth * output tensor " + "height " + "* output tensor width. Expected %ld < %ld, but got " + "%ld >= %ld. 
Please check input value.", + index, + output_feasize, + index, + output_feasize)); + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + unpool_grad, CPU, ALL_LAYOUT, phi::UnpoolGradKernel, float, double) {} + +PD_REGISTER_KERNEL( + unpool3d_grad, CPU, ALL_LAYOUT, phi::Unpool3dGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/unpool_kernel.cc b/paddle/phi/kernels/cpu/unpool_kernel.cc new file mode 100644 index 0000000000000..3ec0c6222348f --- /dev/null +++ b/paddle/phi/kernels/cpu/unpool_kernel.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unpool_kernel.h" + +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void UnpoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + DenseTensor* out) { + T* output_data = dev_ctx.template Alloc(out); + if (output_data) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + } + const int batch_size = x.dims()[0]; + const int input_height = x.dims()[2]; + const int input_width = x.dims()[3]; + const int output_channels = out->dims()[1]; + const int output_height = out->dims()[2]; + const int output_width = out->dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = x.data(); + const int* indices_data = indices.data(); + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE_LT( + index, + output_feasize, + phi::errors::InvalidArgument( + "index should less than output tensor height * output tensor " + "width. Expected %ld < %ld, but got " + "%ld >= %ld. 
Please check input value.", + index, + output_feasize, + index, + output_feasize)); + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } +} + +template +void Unpool3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + DenseTensor* out) { + T* output_data = dev_ctx.template Alloc(out); + if (output_data) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + } + const int batch_size = x.dims()[0]; + const int input_depth = x.dims()[2]; + const int input_height = x.dims()[3]; + const int input_width = x.dims()[4]; + const int output_channels = out->dims()[1]; + const int output_depth = out->dims()[2]; + const int output_height = out->dims()[3]; + const int output_width = out->dims()[4]; + int input_feasize = input_depth * input_height * input_width; + int output_feasize = output_depth * output_height * output_width; + const T* input_data = x.data(); + const int* indices_data = indices.data(); + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE_LT( + index, + output_feasize, + phi::errors::InvalidArgument( + "index should less than output tensor depth * output tensor " + "height " + "* output tensor width. Expected %ld < %ld, but got " + "%ld >= %ld. Please check input value.", + index, + output_feasize, + index, + output_feasize)); + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(unpool, CPU, ALL_LAYOUT, phi::UnpoolKernel, float, double) {} + +PD_REGISTER_KERNEL( + unpool3d, CPU, ALL_LAYOUT, phi::Unpool3dKernel, float, double) {} diff --git a/paddle/phi/kernels/dropout_grad_kernel.h b/paddle/phi/kernels/dropout_grad_kernel.h index d8d5363ad59b7..c61e4d0b8598d 100644 --- a/paddle/phi/kernels/dropout_grad_kernel.h +++ b/paddle/phi/kernels/dropout_grad_kernel.h @@ -23,7 +23,7 @@ template void DropoutGradRawKernel(const Context& dev_ctx, const DenseTensor& mask, const DenseTensor& out_grad, - float p, + const Scalar& p, bool is_test, const std::string& mode, DenseTensor* x_grad); @@ -32,7 +32,7 @@ template void DropoutNdGradKernel(const Context& dev_ctx, const DenseTensor& mask, const DenseTensor& out_grad, - float p, + const Scalar& p, bool is_test, const std::string& mode, const std::vector& axis, diff --git a/paddle/phi/kernels/dropout_kernel.h b/paddle/phi/kernels/dropout_kernel.h index cba8160058e99..ff718d641bedc 100644 --- a/paddle/phi/kernels/dropout_kernel.h +++ b/paddle/phi/kernels/dropout_kernel.h @@ -24,7 +24,7 @@ template void DropoutRawKernel(const Context& dev_ctx, const DenseTensor& x, const paddle::optional& seed_tensor, - float p, + const Scalar& p, bool is_test, const std::string& mode, int seed, @@ -36,7 +36,7 @@ template void DropoutNdKernel(const Context& dev_ctx, const DenseTensor& x, const paddle::optional& seed_tensor, - float p, + const Scalar& p, bool is_test, const std::string& mode, int seed, diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index d8cf0bd2ef90d..2c969cc43d2f1 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ 
b/paddle/phi/kernels/empty_kernel.cc @@ -110,3 +110,19 @@ PD_REGISTER_KERNEL(empty_like, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } #endif + +#ifdef PADDLE_WITH_XPU +PD_REGISTER_KERNEL(empty, + XPU, + ALL_LAYOUT, + phi::EmptyKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/fft_grad_kernel.h b/paddle/phi/kernels/fft_grad_kernel.h new file mode 100644 index 0000000000000..8f5237f1fd08c --- /dev/null +++ b/paddle/phi/kernels/fft_grad_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void FFTC2CGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* x_grad); + +template +void FFTC2RGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* x_grad); + +template +void FFTR2CGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/fft_kernel.h b/paddle/phi/kernels/fft_kernel.h new file mode 100644 index 0000000000000..6105ec4d0b3ec --- /dev/null +++ b/paddle/phi/kernels/fft_kernel.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
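
The FFT kernel declarations here (and the gradient variants above) follow the usual transform kinds: C2C preserves the shape, a onesided R2C halves the last transformed axis, and C2R restores it from last_dim_size. A small shape sketch, assuming the conventional onesided layout; OnesidedR2CLength and C2RLength are illustrative names, not part of these headers.

#include <cstdint>

// Sketch: expected length of the last transformed axis.
// A onesided R2C transform of a length-n real signal yields n / 2 + 1 complex
// bins; C2R uses last_dim_size when given, otherwise infers 2 * (m - 1) from
// the m complex bins of its input.
int64_t OnesidedR2CLength(int64_t n) { return n / 2 + 1; }

int64_t C2RLength(int64_t complex_bins, int64_t last_dim_size) {
  return last_dim_size > 0 ? last_dim_size : 2 * (complex_bins - 1);
}
// e.g. a real signal of length 8 -> 5 complex bins; 5 bins -> length 8 back.
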
+ +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void FFTC2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* out); + +template +void FFTC2RKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* out); + +template +void FFTR2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/fill_diagonal_grad_kernel.h b/paddle/phi/kernels/fill_diagonal_grad_kernel.h index 23f2ae577c2cb..e1d55364945e6 100644 --- a/paddle/phi/kernels/fill_diagonal_grad_kernel.h +++ b/paddle/phi/kernels/fill_diagonal_grad_kernel.h @@ -16,8 +16,6 @@ #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/impl/fill_diagonal_kernel_impl.h" - namespace phi { template diff --git a/paddle/phi/kernels/fill_diagonal_kernel.h b/paddle/phi/kernels/fill_diagonal_kernel.h index ecd3ffbe5ccfb..6d4dc78e8240f 100644 --- a/paddle/phi/kernels/fill_diagonal_kernel.h +++ b/paddle/phi/kernels/fill_diagonal_kernel.h @@ -15,9 +15,6 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" - -#include "paddle/phi/kernels/impl/fill_diagonal_kernel_impl.h" - namespace phi { template diff --git a/paddle/phi/kernels/fill_diagonal_tensor_grad_kernel.h b/paddle/phi/kernels/fill_diagonal_tensor_grad_kernel.h new file mode 100644 index 0000000000000..c44d782593d9d --- /dev/null +++ b/paddle/phi/kernels/fill_diagonal_tensor_grad_kernel.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void FillDiagonalTensorGradKernel(const Context &ctx, + const DenseTensor &out_grad, + int64_t offset, + int dim1, + int dim2, + DenseTensor *x_grad); + +void CalMatDims(phi::DDim out_dims, + int dim1, + int dim2, + int64_t *offset, + int64_t *new_dims, + int64_t *strides, + int64_t *matoffset); + +} // namespace phi diff --git a/paddle/phi/kernels/fill_diagonal_tensor_kernel.h b/paddle/phi/kernels/fill_diagonal_tensor_kernel.h new file mode 100644 index 0000000000000..9d6c8da93edb5 --- /dev/null +++ b/paddle/phi/kernels/fill_diagonal_tensor_kernel.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void FillDiagonalTensorKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int64_t offset, + int dim1, + int dim2, + DenseTensor* out); + +void CalMatDims(phi::DDim out_dims, + int dim1, + int dim2, + int64_t* offset, + int64_t* new_dims, + int64_t* strides, + int64_t* matoffset); + +} // namespace phi diff --git a/paddle/phi/ops/compat/erfinv_sig.cc b/paddle/phi/kernels/fill_grad_kernel.h similarity index 68% rename from paddle/phi/ops/compat/erfinv_sig.cc rename to paddle/phi/kernels/fill_grad_kernel.h index 37d30aaaeb685..8e43d996489cb 100644 --- a/paddle/phi/ops/compat/erfinv_sig.cc +++ b/paddle/phi/kernels/fill_grad_kernel.h @@ -12,14 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/compat/op_utils.h" +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" namespace phi { -KernelSignature ErfinvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("erfinv_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}); -} +template +void FillGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const Scalar& value, + DenseTensor* in_grad); } // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(erfinv_grad, phi::ErfinvGradOpArgumentMapping); diff --git a/paddle/phi/kernels/impl/fill_diagonal_kernel_impl.h b/paddle/phi/kernels/fill_kernel.h similarity index 75% rename from paddle/phi/kernels/impl/fill_diagonal_kernel_impl.h rename to paddle/phi/kernels/fill_kernel.h index 65383176a0f7a..9af3f465303b3 100644 --- a/paddle/phi/kernels/impl/fill_diagonal_kernel_impl.h +++ b/paddle/phi/kernels/fill_kernel.h @@ -14,19 +14,15 @@ #pragma once +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { -inline int64_t CalStride(phi::DDim dim) { - int rank = dim.size(); - int64_t dimsum = 1; - int64_t strides = 0; - for (int i = rank - 1; i >= 0; i--) { - strides += dimsum; - dimsum *= dim[i]; - } - return strides; -} +template +void FillKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& value, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/fold_grad_kernel.h b/paddle/phi/kernels/fold_grad_kernel.h new file mode 100644 index 0000000000000..2e8614484aa04 --- /dev/null +++ b/paddle/phi/kernels/fold_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void FoldGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& output_sizes, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + DenseTensor* x_grad); +} diff --git a/paddle/phi/kernels/fold_kernel.h b/paddle/phi/kernels/fold_kernel.h new file mode 100644 index 0000000000000..3fd6281b2cc7f --- /dev/null +++ b/paddle/phi/kernels/fold_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void FoldKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& output_sizes, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + DenseTensor* out); +} diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 646d65cf8a63e..afa46f1dacaef 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -16,3 +16,20 @@ math_library(pooling DEPS dense_tensor) math_library(segment_pooling) math_library(sequence2batch) math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function) + +if(WITH_GPU OR WITH_ROCM) + if(MKL_FOUND AND WITH_ONEMKL) + math_library(fft spectral_op.cu DEPS dynload_cuda dynload_mklrt + dense_tensor) + target_include_directories(fft PRIVATE ${MKL_INCLUDE}) + else() + math_library(fft spectral_op.cu DEPS dynload_cuda dense_tensor pocketfft) + endif() +else() + if(MKL_FOUND AND WITH_ONEMKL) + mathp_library(fft DEPS dynload_mklrt dense_tensor) + target_include_directories(fft PRIVATE ${MKL_INCLUDE}) + else() + math_library(fft DEPS dense_tensor pocketfft) + endif() +endif() diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 542c59bec1b45..318f2e8b6b3cf 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -1604,7 +1604,11 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { } static constexpr ActBwdOpFwdDeps FwdDeps() { +#ifdef PADDLE_WITH_MLU + return ActBwdOpFwdDeps::kDepX; +#else return ActBwdOpFwdDeps::kDepOut; +#endif } }; diff --git a/paddle/phi/kernels/affine_grid_impl.h b/paddle/phi/kernels/funcs/affine_grid_utils.h similarity index 98% rename from paddle/phi/kernels/affine_grid_impl.h rename to paddle/phi/kernels/funcs/affine_grid_utils.h index 14c9fa7b56f8c..601b7f1ba6ea8 100644 --- a/paddle/phi/kernels/affine_grid_impl.h +++ b/paddle/phi/kernels/funcs/affine_grid_utils.h @@ -14,7 +14,6 @@ #pragma once -#include 
"paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index 139341536debf..2daf8ab0bd97e 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -231,5 +231,16 @@ inline DDim GetOutputDims(const DDim &s_dims, const DDim &l_dims) { return phi::make_ddim(shapes); } +inline int64_t CalStride(phi::DDim dim) { + int rank = dim.size(); + int64_t dimsum = 1; + int64_t strides = 0; + for (int i = rank - 1; i >= 0; i--) { + strides += dimsum; + dimsum *= dim[i]; + } + return strides; +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/compare_functors.h b/paddle/phi/kernels/funcs/compare_functors.h index 569fed7b7fbab..e16083506bbe0 100644 --- a/paddle/phi/kernels/funcs/compare_functors.h +++ b/paddle/phi/kernels/funcs/compare_functors.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include namespace phi { namespace funcs { @@ -35,6 +36,10 @@ template struct EqualFunctor { HOSTDEVICE OutT operator()(const InT a, const InT b) const { if (std::is_floating_point::value) { + if (isinf(static_cast(a)) || isinf(static_cast(b))) + return static_cast(a == b); + if (isnan(static_cast(a)) || isnan(static_cast(b))) + return static_cast(false); return static_cast(fabs(static_cast(a - b)) < 1e-8); } else { return static_cast(a == b); diff --git a/paddle/phi/kernels/funcs/cufft_util.h b/paddle/phi/kernels/funcs/cufft_util.h new file mode 100644 index 0000000000000..584425c6112a5 --- /dev/null +++ b/paddle/phi/kernels/funcs/cufft_util.h @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/dynload/cufft.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_key.h" + +namespace phi { +namespace funcs { +namespace detail { + +// An RAII encapsulation of cuFFTHandle +class CuFFTHandle { + public: + CuFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cufftCreate(&handle_)); + } + + CuFFTHandle(const CuFFTHandle& other) = delete; + CuFFTHandle& operator=(const CuFFTHandle& other) = delete; + + CuFFTHandle(CuFFTHandle&& other) = delete; + CuFFTHandle& operator=(CuFFTHandle&& other) = delete; + + ::cufftHandle& get() { return handle_; } + const ::cufftHandle& get() const { return handle_; } + + ~CuFFTHandle() { phi::dynload::cufftDestroy(handle_); } + + private: + ::cufftHandle handle_; +}; + +// Returns true if the transform type has complex input +inline bool has_complex_input(FFTTransformType type) { + switch (type) { + case FFTTransformType::C2C: + case FFTTransformType::C2R: + return true; + + case FFTTransformType::R2C: + return false; + } + PADDLE_THROW(phi::errors::InvalidArgument("Unknown FFTTransformType")); +} + +// Returns true if the transform type has complex output +inline bool has_complex_output(FFTTransformType type) { + switch (type) { + case FFTTransformType::C2C: + case FFTTransformType::R2C: + return true; + + case FFTTransformType::C2R: + return false; + } + PADDLE_THROW(phi::errors::InvalidArgument("Unknown FFTTransformType")); +} + +class FFTConfig { + public: + using plan_size_type = long long int; // NOLINT (be consistent with cufft) + explicit FFTConfig(const FFTConfigKey& key) + : FFTConfig( + std::vector(key.sizes_, key.sizes_ + key.signal_ndim_ + 1), + key.fft_type_, + key.value_type_) {} + // sizes are full signal, including batch size and always two-sided + FFTConfig(const std::vector& sizes, + FFTTransformType fft_type, + DataType precison) + : fft_type_(fft_type), precision_(precison) { + const auto batch_size = static_cast(sizes[0]); + std::vector signal_sizes(sizes.cbegin() + 1, sizes.cend()); + const int signal_ndim = sizes.size() - 1; + + cudaDataType itype, otype, exec_type; + const bool complex_input = has_complex_input(fft_type); + const bool complex_output = has_complex_output(fft_type); + if (precison == DataType::FLOAT32) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (precison == DataType::FLOAT64) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? 
CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Only transforms of type float32 and float64 are supported.")); + } + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftXtMakePlanMany(plan(), + signal_ndim, + signal_sizes.data(), + /* inembed */ nullptr, + /* base_istride */ 1L, + /* idist */ 1L, + itype, + /* onembed */ nullptr, + /* base_ostride */ 1L, + /* odist */ 1L, + otype, + batch_size, + &ws_size_, + exec_type)); + } + + FFTConfig(const FFTConfig& other) = delete; + FFTConfig& operator=(const FFTConfig& other) = delete; + + FFTConfig(FFTConfig&& other) = delete; + FFTConfig& operator=(FFTConfig&& other) = delete; + + const cufftHandle& plan() const { return plan_.get(); } + FFTTransformType transform_type() const { return fft_type_; } + DataType data_type() const { return precision_; } + size_t workspace_size() const { return ws_size_; } + + private: + CuFFTHandle plan_; + size_t ws_size_; // workspace size in bytes + FFTTransformType fft_type_; + DataType precision_; +}; + +// NOTE: R2C is forward-only, C2R is backward only +static void exec_plan(const FFTConfig& config, + void* in_data, + void* out_data, + bool forward) { + auto& plan = config.plan(); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cufftXtExec( + plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/detection/nms_util.h b/paddle/phi/kernels/funcs/detection/nms_util.h similarity index 84% rename from paddle/fluid/operators/detection/nms_util.h rename to paddle/phi/kernels/funcs/detection/nms_util.h index 527a5c858bd6a..e862b2a90f06c 100644 --- a/paddle/fluid/operators/detection/nms_util.h +++ b/paddle/phi/kernels/funcs/detection/nms_util.h @@ -18,9 +18,11 @@ limitations under the License. */ #include #include "paddle/fluid/operators/detection/poly_util.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" -namespace paddle { -namespace operators { +namespace phi { +namespace funcs { template bool SortScorePairDescend(const std::pair& pair1, @@ -94,9 +96,10 @@ T PolyIoU(const T* box1, const T* box2, const size_t box_size, const bool normalized) { - T bbox1_area = PolyArea(box1, box_size, normalized); - T bbox2_area = PolyArea(box2, box_size, normalized); - T inter_area = PolyOverlapArea(box1, box2, box_size, normalized); + T bbox1_area = paddle::operators::PolyArea(box1, box_size, normalized); + T bbox2_area = paddle::operators::PolyArea(box2, box_size, normalized); + T inter_area = + paddle::operators::PolyOverlapArea(box1, box2, box_size, normalized); if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) { // If coordinate values are invalid // if area size <= 0, return 0. 
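
PolyIoU above guards against zero areas before forming the usual intersection-over-union ratio. For reference, a minimal axis-aligned IoU sketch with the same zero-area early-out; this uses ordinary {xmin, ymin, xmax, ymax} boxes rather than the polygon path taken by PolyIoU, and BoxIoU is an illustrative helper, not code from this patch.

#include <algorithm>

// Sketch: IoU of two axis-aligned boxes given as {xmin, ymin, xmax, ymax}.
// Returns 0 when either box or the intersection is degenerate, matching the
// zero-area guard used by PolyIoU.
inline float BoxIoU(const float a[4], const float b[4]) {
  const float iw = std::max(0.f, std::min(a[2], b[2]) - std::max(a[0], b[0]));
  const float ih = std::max(0.f, std::min(a[3], b[3]) - std::max(a[1], b[1]));
  const float inter = iw * ih;
  const float area_a = (a[2] - a[0]) * (a[3] - a[1]);
  const float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  if (area_a <= 0.f || area_b <= 0.f || inter <= 0.f) return 0.f;
  return inter / (area_a + area_b - inter);
}
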
@@ -124,11 +127,12 @@ static inline std::vector> GetSortedScoreIndex( } template -static inline framework::Tensor VectorToTensor( - const std::vector& selected_indices, int selected_num) { - framework::Tensor keep_nms; +static inline DenseTensor VectorToTensor(const DeviceContext& ctx, + const std::vector& selected_indices, + int selected_num) { + DenseTensor keep_nms; keep_nms.Resize({selected_num}); - auto* keep_data = keep_nms.mutable_data(platform::CPUPlace()); + auto* keep_data = ctx.template Alloc(&keep_nms); for (int i = 0; i < selected_num; ++i) { keep_data[i] = selected_indices[i]; } @@ -136,12 +140,12 @@ static inline framework::Tensor VectorToTensor( } template -framework::Tensor NMS(const platform::DeviceContext& ctx, - framework::Tensor* bbox, - framework::Tensor* scores, - T nms_threshold, - float eta, - bool pixel_offset = true) { +DenseTensor NMS(const DeviceContext& ctx, + DenseTensor* bbox, + DenseTensor* scores, + T nms_threshold, + float eta, + bool pixel_offset = true) { int64_t num_boxes = bbox->dims()[0]; // 4: [xmin ymin xmax ymax] int64_t box_size = bbox->dims()[1]; @@ -178,8 +182,8 @@ framework::Tensor NMS(const platform::DeviceContext& ctx, adaptive_threshold *= eta; } } - return VectorToTensor(selected_indices, selected_num); + return VectorToTensor(ctx, selected_indices, selected_num); } -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft.cc b/paddle/phi/kernels/funcs/fft.cc new file mode 100644 index 0000000000000..9895ff406cb89 --- /dev/null +++ b/paddle/phi/kernels/funcs/fft.cc @@ -0,0 +1,378 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
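
The NMS helper moved into phi::funcs above keeps the same greedy selection loop: candidates are visited in score order and suppressed when their overlap with any already-kept box exceeds the threshold, which eta can tighten as boxes are kept. A compact sketch of that loop under the assumption that an iou(i, j) callable is available; GreedyNms is an illustrative name and simplifies the adaptive-threshold condition.

#include <functional>
#include <vector>

// Sketch: greedy NMS over score-sorted candidate indices. `iou` is assumed to
// return the overlap of two boxes; eta < 1 progressively scales down the
// threshold, in the spirit of the adaptive_threshold update in NMS above.
std::vector<int> GreedyNms(const std::vector<int>& sorted_idx,
                           const std::function<float(int, int)>& iou,
                           float nms_threshold, float eta) {
  std::vector<int> keep;
  float threshold = nms_threshold;
  for (int cand : sorted_idx) {
    bool suppressed = false;
    for (int k : keep) {
      if (iou(cand, k) > threshold) { suppressed = true; break; }
    }
    if (!suppressed) {
      keep.push_back(cand);
      if (eta < 1.f && threshold > 0.5f) threshold *= eta;
    }
  }
  return keep;
}
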
+ +#include "paddle/phi/kernels/funcs/fft.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" +#if defined(PADDLE_WITH_ONEMKL) +#include "paddle/phi/kernels/funcs/mkl_fft_utils.h" +#elif defined(PADDLE_WITH_POCKETFFT) +#define POCKETFFT_CACHE_SIZE 16 +#include "extern_pocketfft/pocketfft_hdronly.h" +#endif + +namespace phi { +namespace funcs { +#if defined(PADDLE_WITH_ONEMKL) + +namespace detail { +// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) +template +void exec_fft(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + const phi::DDim& in_sizes = x.dims(); + const int ndim = in_sizes.size(); + const int signal_ndim = axes.size(); + const int batch_ndim = ndim - signal_ndim; + const phi::DDim& out_sizes = out->dims(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), 0); + std::vector is_transformed_dim(ndim, false); + for (const auto& d : axes) { + is_transformed_dim[d] = true; + } + const auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), [&](size_t axis) { + return !is_transformed_dim[axis]; + }); + std::copy(axes.cbegin(), axes.cend(), batch_end); + + // transpose input according to the permutation + DenseTensor transposed_input = + Transpose(ctx, x, dim_permute); + const phi::DDim& transposed_input_shape = transposed_input.dims(); + + // batch size + int64_t batch_size = 1L; + for (int i = 0; i < batch_ndim; i++) { + batch_size *= transposed_input_shape[i]; + } + + // make an collapsed input: collapse batch axes for input + std::vector collapsed_input_shape_; + collapsed_input_shape_.reserve(1 + signal_ndim); + collapsed_input_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_input_shape_.push_back(in_sizes[axes[i]]); + } + phi::DDim collapsed_input_shape = phi::make_ddim(collapsed_input_shape_); + transposed_input.Resize(collapsed_input_shape); + DenseTensor& collapsed_input = transposed_input; + + // make a collapsed output + phi::DDim transposed_output_shape = out_sizes.transpose(dim_permute); + std::vector collapsed_output_shape_; + collapsed_output_shape_.reserve(1 + signal_ndim); + collapsed_output_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_output_shape_.push_back(out_sizes[axes[i]]); + } + phi::DDim collapsed_output_shape = phi::make_ddim(collapsed_output_shape_); + DenseTensor collapsed_output; + collapsed_output.Resize(collapsed_output_shape); + ctx.Alloc(&collapsed_output); + + // make a DFTI_DESCRIPTOR + std::vector signal_sizes(1 + signal_ndim); + signal_sizes[0] = batch_size; + for (int i = 0; i < signal_ndim; i++) { + signal_sizes[1 + i] = + std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]); + } + const phi::DDim input_stride = phi::stride(collapsed_input_shape); + const phi::DDim output_stride = phi::stride(collapsed_output_shape); + + DftiDescriptor desc = plan_mkl_fft(x.dtype(), + out->dtype(), + input_stride, + output_stride, + signal_sizes, + normalization, + forward); + // execute the transform + const FFTTransformType fft_type = GetFFTTransformType(x.dtype(), out->type()); + if (fft_type == FFTTransformType::C2R && forward) { + ConjKernel(ctx, collapsed_input, &collapsed_input); + 
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } else if (fft_type == FFTTransformType::R2C && !forward) { + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + ConjKernel(ctx, collapsed_output, &collapsed_output); + } else { + if (forward) { + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } else { + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( + desc.get(), collapsed_input.data(), collapsed_output.data())); + } + } + + // resize for the collapsed output + collapsed_output.Resize(transposed_output_shape); + phi::DenseTensor& transposed_output = collapsed_output; + + // reverse the transposition + std::vector reverse_dim_permute(ndim); + for (int i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; + } + TransposeKernel( + ctx, transposed_output, reverse_dim_permute, out); +} +} // namespace detail + +template +struct FFTC2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + detail::exec_fft(ctx, x, out, axes, normalization, forward); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + detail::exec_fft(ctx, x, out, axes, normalization, forward); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + if (axes.size() > 1) { + DenseTensor c2c_result = EmptyLike(ctx, x); + + const std::vector c2c_dims(axes.begin(), axes.end() - 1); + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, x, &c2c_result, c2c_dims, normalization, forward); + + const std::vector new_axes{axes.back()}; + detail::exec_fft( + ctx, c2c_result, out, new_axes, normalization, forward); + } else { + detail::exec_fft(ctx, x, out, axes, normalization, forward); + } + } +}; + +#elif defined(PADDLE_WITH_POCKETFFT) +namespace detail { +template +static T compute_factor(size_t size, FFTNormMode normalization) { + constexpr auto one = static_cast(1); + switch (normalization) { + case FFTNormMode::none: + return one; + case FFTNormMode::by_n: + return one / static_cast(size); + case FFTNormMode::by_sqrt_n: + return one / std::sqrt(static_cast(size)); + } + PADDLE_THROW(phi::errors::InvalidArgument("Unsupported normalization type")); +} +} // namespace detail + +template +struct FFTC2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + using R = typename Ti::value_type; + using C = std::complex; + + const auto& input_dim = x.dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), + in_strides.end(), + in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + + const auto* in_data = reinterpret_cast(x.data()); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet factor + 
size_t signal_numel = 1; + for (const auto axis : axes) { + signal_numel *= in_sizes[axis]; + } + R factor = detail::compute_factor(signal_numel, normalization); + pocketfft::c2c(in_sizes, + in_strides, + in_strides, + axes_, + forward, + in_data, + out_data, + factor); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + using R = Ti; + using C = std::complex; + + const auto& input_dim = x.dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(in_strides.begin(), + in_strides.end(), + in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto& output_dim = out->dims(); + const std::vector out_sizes = phi::vectorize(output_dim); + std::vector out_strides = + phi::vectorize(phi::stride(output_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(out_strides.begin(), + out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto* in_data = x.data(); + auto* out_data = reinterpret_cast(out->data()); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + size_t signal_numel = 1; + for (const auto axis : axes) { + signal_numel *= in_sizes[axis]; + } + R factor = detail::compute_factor(signal_numel, normalization); + pocketfft::r2c(in_sizes, + in_strides, + out_strides, + axes_, + forward, + in_data, + out_data, + factor); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + using R = To; + using C = std::complex; + + const auto& input_dim = x.dims(); + const std::vector in_sizes = phi::vectorize(input_dim); + std::vector in_strides = + phi::vectorize(phi::stride(input_dim)); + { + const int64_t data_size = sizeof(C); + std::transform(in_strides.begin(), + in_strides.end(), + in_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto& output_dim = out->dims(); + const std::vector out_sizes = phi::vectorize(output_dim); + std::vector out_strides = + phi::vectorize(phi::stride(output_dim)); + { + const int64_t data_size = sizeof(R); + std::transform(out_strides.begin(), + out_strides.end(), + out_strides.begin(), + [&](std::ptrdiff_t s) { return s * data_size; }); + } + + const auto* in_data = reinterpret_cast(x.data()); + auto* out_data = out->data(); + // pocketfft requires std::vector + std::vector axes_(axes.size()); + std::copy(axes.begin(), axes.end(), axes_.begin()); + // compuet normalization factor + size_t signal_numel = 1; + for (const auto axis : axes) { + signal_numel *= out_sizes[axis]; + } + R factor = detail::compute_factor(signal_numel, normalization); + pocketfft::c2r(out_sizes, + in_strides, + out_strides, + axes_, + forward, + in_data, + out_data, + factor); + } +}; +#endif + +using complex64_t = phi::dtype::complex; +using complex128_t = phi::dtype::complex; +template struct FFTC2CFunctor; +template struct FFTC2CFunctor; +template struct FFTC2RFunctor; +template struct FFTC2RFunctor; +template struct FFTR2CFunctor; +template struct FFTR2CFunctor; +} // namespace funcs +} // namespace phi diff --git 
a/paddle/phi/kernels/funcs/fft.cu b/paddle/phi/kernels/funcs/fft.cu new file mode 100644 index 0000000000000..edac497bc8e8b --- /dev/null +++ b/paddle/phi/kernels/funcs/fft.cu @@ -0,0 +1,346 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_cache.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/assign_kernel.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/scale_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { +namespace funcs { +namespace detail { + +// Use the optimized path to perform single R2C or C2R if transformation dim is +// supported by cuFFT +static bool use_optimized_fft_path(const std::vector& axes) { + // For performance reason, when axes starts with (0, 1), do not use the + // optimized path. + if (axes.size() > kMaxFFTNdim || + (axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) { + return false; + } else { + return true; + } +} + +static double fft_normalization_scale(FFTNormMode normalization, + const std::vector& sizes, + const std::vector& dims) { + // auto norm = static_cast(normalization); + if (normalization == FFTNormMode::none) { + return static_cast(1.0); + } + + int64_t signal_numel = 1; + for (auto dim : dims) { + signal_numel *= sizes[dim]; + } + const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) + ? 
std::sqrt(signal_numel) + : static_cast(signal_numel); + return static_cast(1.0 / scale_denom); +} + +template +void exec_normalization(const phi::GPUContext& ctx, + const DenseTensor& in, + DenseTensor* out, + FFTNormMode normalization, + const std::vector& sizes, + const std::vector& axes) { + const double scale = fft_normalization_scale(normalization, sizes, axes); + if (scale != 1.0) { + ScaleKernel(ctx, in, scale, 0, true, out); + } else { + AssignKernel(ctx, in, out); + } +} + +bool has_large_prime_factor(int64_t n) { + constexpr int64_t first_large_prime = 11; + const std::array prime_radices{{2, 3, 5, 7}}; + for (auto prime : prime_radices) { + if (n < first_large_prime) { + return false; + } + while (n % prime == 0) { + n /= prime; + } + } + return n != 1; +} + +#if defined(PADDLE_WITH_CUDA) +inline bool use_cache(const int64_t* signal_size) { + bool using_cache = true; + int cufft_version; + phi::dynload::cufftGetVersion(&cufft_version); + if (10300 <= cufft_version && cufft_version <= 10400) { + using_cache = std::none_of( + signal_size + 1, signal_size + kMaxDataNdim, [](int64_t dim_size) { + return has_large_prime_factor(dim_size); + }); + } + return using_cache; +} +#elif defined(PADDLE_WITH_HIP) +inline bool use_cache(const int64_t* signal_size) { return true; } +#endif + +// up to 3d unnormalized fft transform (c2r, r2c, c2c) +template +void exec_fft(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + bool forward) { + const phi::DDim& in_sizes = x.dims(); + const int ndim = in_sizes.size(); + const int signal_ndim = axes.size(); + const int batch_ndim = ndim - signal_ndim; + const phi::DDim& out_sizes = out->dims(); + + // make a dim permutation + std::vector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), 0); + std::vector is_transformed_dim(ndim, false); + for (const auto& d : axes) { + is_transformed_dim[d] = true; + } + const auto batch_end = + std::partition(dim_permute.begin(), dim_permute.end(), [&](size_t axis) { + return !is_transformed_dim[axis]; + }); + std::copy(axes.cbegin(), axes.cend(), batch_end); + + // transpose input according to the permutation + DenseTensor transposed_input = + Transpose(ctx, x, dim_permute); + const phi::DDim transposed_input_shape = transposed_input.dims(); + + // batch size + int64_t batch_size = 1L; + for (int i = 0; i < batch_ndim; i++) { + batch_size *= transposed_input_shape[i]; + } + + // make an collapsed input: collapse batch axes for input + std::vector collapsed_input_shape_; + collapsed_input_shape_.reserve(1 + signal_ndim); + collapsed_input_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_input_shape_.push_back(in_sizes[axes[i]]); + } + phi::DDim collapsed_input_shape = phi::make_ddim(collapsed_input_shape_); + transposed_input.Resize(collapsed_input_shape); + DenseTensor& collapsed_input = transposed_input; + + // make a collapsed output + phi::DDim transposed_output_shape = out_sizes.transpose(dim_permute); + std::vector collapsed_output_shape_; + collapsed_output_shape_.reserve(1 + signal_ndim); + collapsed_output_shape_.emplace_back(batch_size); + for (int i = 0; i < signal_ndim; i++) { + collapsed_output_shape_.push_back(out_sizes[axes[i]]); + } + phi::DDim collapsed_output_shape = phi::make_ddim(collapsed_output_shape_); + DenseTensor collapsed_output; + collapsed_output.Resize(collapsed_output_shape); + ctx.Alloc(&collapsed_output); + + FFTConfigKey key = + create_fft_configkey(collapsed_input, 
collapsed_output, signal_ndim); + int64_t device_id = ctx.GetPlace().GetDeviceId(); + FFTConfig* config = nullptr; + std::unique_ptr config_ = nullptr; + bool using_cache = use_cache(key.sizes_); + + if (using_cache) { + FFTConfigCache& plan_cache = get_fft_plan_cache(device_id); + std::unique_lock guard(plan_cache.mutex, std::defer_lock); + guard.lock(); + config = &(plan_cache.lookup(key)); + } else { + config_ = std::make_unique(key); + config = config_.get(); + } + + const int64_t workspace_size = static_cast(config->workspace_size()); + DenseTensor workspace_tensor = Empty(ctx, {workspace_size}); + + // prepare cufft for execution +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftSetStream(config->plan(), ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cufftSetWorkArea(config->plan(), workspace_tensor.data())); +#elif defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftSetStream(config->plan(), ctx.stream())); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftSetWorkArea(config->plan(), workspace_tensor.data())); +#endif + + // execution of fft plan + const FFTTransformType fft_type = config->transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + ConjKernel(ctx, collapsed_input, &collapsed_input); + exec_plan(*config, collapsed_input.data(), collapsed_output.data(), false); + } else if (fft_type == FFTTransformType::R2C && !forward) { + exec_plan(*config, collapsed_input.data(), collapsed_output.data(), true); + ConjKernel(ctx, collapsed_output, &collapsed_output); + } else { + exec_plan( + *config, collapsed_input.data(), collapsed_output.data(), forward); + } + + // resize for the collapsed output + collapsed_output.Resize(transposed_output_shape); + phi::DenseTensor& transposed_output = collapsed_output; + + // reverse the transposition + std::vector reverse_dim_permute(ndim); + for (int i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; + } + TransposeKernel( + ctx, transposed_output, reverse_dim_permute, out); +} +} // namespace detail + +template +struct FFTC2CFunctor { + void operator()(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + if (axes.empty()) { + AssignKernel(ctx, x, out); + return; + } + + std::vector working_axes = axes; + std::sort(working_axes.begin(), working_axes.end()); + std::vector first_dims; + size_t max_dims; + + DenseTensor working_tensor = x; // shallow copy + while (true) { + max_dims = std::min(static_cast(detail::kMaxFFTNdim), + working_axes.size()); + first_dims.assign(working_axes.end() - max_dims, working_axes.end()); + + detail::exec_fft(ctx, working_tensor, out, first_dims, forward); + working_axes.resize(working_axes.size() - max_dims); + first_dims.clear(); + + if (working_axes.empty()) { + break; + } + + if (working_tensor.IsSharedWith(x)) { + working_tensor = std::move(*out); + *out = EmptyLike(ctx, x); + } else { + std::swap(*out, working_tensor); + } + } + + std::vector out_dims = phi::vectorize(x.dims()); + detail::exec_normalization( + ctx, *out, out, normalization, out_dims, axes); + } +}; + +template +struct FFTC2RFunctor { + void operator()(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + std::vector out_dims = phi::vectorize(out->dims()); + + if (detail::use_optimized_fft_path(axes)) { + DenseTensor x_copy = Assign(ctx, x); + 
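+      // Work on a copy of x: exec_fft conjugates its input in place for a
+      // forward C2R transform, and cuFFT/hipFFT C2R execution may overwrite
+      // the input buffer.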
detail::exec_fft(ctx, x_copy, out, axes, forward); + } else { + DenseTensor c2c_result = EmptyLike(ctx, x); + FFTC2CFunctor c2c_functor; + c2c_functor(ctx, + x, + &c2c_result, + {axes.begin(), axes.end() - 1}, + FFTNormMode::none, + forward); + detail::exec_fft(ctx, c2c_result, out, {axes.back()}, forward); + } + detail::exec_normalization( + ctx, *out, out, normalization, out_dims, axes); + } +}; + +template +struct FFTR2CFunctor { + void operator()(const phi::GPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward) { + if (detail::use_optimized_fft_path(axes)) { + detail::exec_fft(ctx, x, out, axes, forward); + } else { + DenseTensor r2c_result = EmptyLike(ctx, *out); + detail::exec_fft(ctx, x, &r2c_result, {axes.back()}, forward); + + FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, + r2c_result, + out, + {axes.begin(), axes.end() - 1}, + FFTNormMode::none, + forward); + } + + const auto in_dims = phi::vectorize(x.dims()); + detail::exec_normalization( + ctx, *out, out, normalization, in_dims, axes); + } +}; + +using complex64_t = phi::dtype::complex; +using complex128_t = phi::dtype::complex; +template struct FFTC2CFunctor; +template struct FFTC2CFunctor; +template struct FFTC2RFunctor; +template struct FFTC2RFunctor; +template struct FFTR2CFunctor; +template struct FFTR2CFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft.h b/paddle/phi/kernels/funcs/fft.h new file mode 100644 index 0000000000000..3f9e1191ebb3e --- /dev/null +++ b/paddle/phi/kernels/funcs/fft.h @@ -0,0 +1,103 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { +namespace funcs { + +enum class FFTNormMode : int8_t { + none, // No normalization + by_sqrt_n, // Divide by sqrt(signal_size) + by_n, // Divide by signal_size +}; + +inline FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { + if (norm.empty() || norm == "backward") { + return forward ? FFTNormMode::none : FFTNormMode::by_n; + } + + if (norm == "forward") { + return forward ? 
FFTNormMode::by_n : FFTNormMode::none; + } + + if (norm == "ortho") { + return FFTNormMode::by_sqrt_n; + } + + PADDLE_THROW(phi::errors::InvalidArgument( + "FFT norm string must be 'forward' or 'backward' or 'ortho', " + "received %s", + norm)); +} + +enum class FFTTransformType : int8_t { + C2C = 0, // Complex-to-complex + R2C, // Real-to-complex + C2R, // Complex-to-real +}; + +// Create transform type enum from bools representing if input and output are +// complex +inline FFTTransformType GetFFTTransformType(DataType input_dtype, + DataType output_dtype) { + auto complex_input = IsComplexType(input_dtype); + auto complex_output = IsComplexType(output_dtype); + if (complex_input && complex_output) { + return FFTTransformType::C2C; + } else if (complex_input && !complex_output) { + return FFTTransformType::C2R; + } else if (!complex_input && complex_output) { + return FFTTransformType::R2C; + } + PADDLE_THROW( + phi::errors::InvalidArgument("Real to real FFTs are not supported")); +} + +template +struct FFTC2CFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& X, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward); +}; + +template +struct FFTR2CFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& X, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward); +}; + +template +struct FFTC2RFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& X, + DenseTensor* out, + const std::vector& axes, + FFTNormMode normalization, + bool forward); +}; +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft_cache.h b/paddle/phi/kernels/funcs/fft_cache.h new file mode 100644 index 0000000000000..51e90a6c0d95b --- /dev/null +++ b/paddle/phi/kernels/funcs/fft_cache.h @@ -0,0 +1,189 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#if defined(PADDLE_WITH_CUDA) +#include "paddle/phi/kernels/funcs/cufft_util.h" +#elif defined(PADDLE_WITH_HIP) +#include "paddle/phi/kernels/funcs/hipfft_util.h" +#endif + +namespace phi { +namespace funcs { +namespace detail { + +#if CUDA_VERSION < 10000 +// Note that the max plan number for CUDA version < 10 has to be 1023 +// due to a bug that fails on the 1024th plan +constexpr size_t CUFFT_MAX_PLAN_NUM = 1023; +constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; +#else +constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); +// The default max cache size chosen for CUDA version > 10 is arbitrary. +// This number puts a limit on how big of a plan cache should we maintain by +// default. Users can always configure it via cufft_set_plan_cache_max_size. 
+constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096; +#endif + +static_assert(CUFFT_MAX_PLAN_NUM >= 0 && + CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); +static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && + CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, + "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); + +class FFTConfigCache { + public: + using kv_t = typename std::pair; + using map_t = + typename std::unordered_map, + typename std::list::iterator, + KeyHash, + KeyEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {} + + explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); } + + FFTConfigCache(const FFTConfigCache& other) = delete; + FFTConfigCache& operator=(const FFTConfigCache& other) = delete; + + FFTConfigCache(FFTConfigCache&& other) noexcept + : _usage_list(std::move(other._usage_list)), + _cache_map(std::move(other._cache_map)), + _max_size(other._max_size) {} + + FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { + _usage_list = std::move(other._usage_list); + _cache_map = std::move(other._cache_map); + _max_size = other._max_size; + return *this; + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache and return it. + FFTConfig& lookup(FFTConfigKey params) { + PADDLE_ENFORCE_GT(_max_size, + 0, + phi::errors::InvalidArgument( + "The max size of FFTConfigCache must be great than 0," + "But received is [%d]", + _max_size)); + + map_kkv_iter_t map_it = _cache_map.find(params); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(params), + std::forward_as_tuple(params)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + + std::mutex mutex; + + private: + // Only sets size and does value check. Does not resize the data structures. + void _set_max_size(int64_t new_size) { + // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since + // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check + // first. 
+ PADDLE_ENFORCE_GE( + new_size, + 0, + phi::errors::InvalidArgument( + "cuFFT plan cache size must be non-negative, But received is [%d]", + new_size)); + PADDLE_ENFORCE_LE(new_size, + CUFFT_MAX_PLAN_NUM, + phi::errors::InvalidArgument( + "cuFFT plan cache size can not be larger than [%d], " + "But received is [%d]", + CUFFT_MAX_PLAN_NUM, + new_size)); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +static std::vector> plan_caches; +static std::mutex plan_caches_mutex; + +static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { + std::lock_guard guard(plan_caches_mutex); + + if (device_index >= plan_caches.size()) { + plan_caches.resize(device_index + 1); + } + + if (!plan_caches[device_index]) { + plan_caches[device_index] = std::make_unique(); + } + + return *plan_caches[device_index]; +} +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft_fill_conj.h b/paddle/phi/kernels/funcs/fft_fill_conj.h new file mode 100644 index 0000000000000..91d859020f88b --- /dev/null +++ b/paddle/phi/kernels/funcs/fft_fill_conj.h @@ -0,0 +1,219 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "thrust/device_vector.h" +#endif + +namespace phi { +namespace funcs { + +// Giving a linear destination index and strides of tensor, get_idx return the +// corresponding linear position of source tensor. +// The linear index is the position of flatten tensor. +// Giving a linear destination index and strides of tensor, get_idx return the +// corresponding linear position of source tensor. +// The linear index is the position of flatten tensor. 
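+// Example: for dst_shape = {4, 3} (so dst_strides = {3, 1}) with the last axis
+// being an FFT axis, dst_idx = 7 decomposes to coordinates (2, 1); with
+// conj = true the element is read from source coordinates (2, (3 - 1) % 3),
+// i.e. (2, 2).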
+HOSTDEVICE inline int64_t get_src_idx(const int64_t dst_idx, + const int64_t* dst_strides, + const int64_t* dst_shape, + const int64_t* src_strides, + const bool* is_fft_axis, + const bool conj, + const int64_t rank) { + int64_t src_idx = 0; + int64_t quotient = dst_idx; + int64_t remainder = 0; + + for (int64_t i = 0; i < rank; i++) { + remainder = quotient % dst_strides[i]; + quotient = quotient / dst_strides[i]; + if (conj && is_fft_axis[i]) { + src_idx += ((dst_shape[i] - quotient) % dst_shape[i]) * src_strides[i]; + } else { + src_idx += src_strides[i] * quotient; + } + quotient = remainder; + } + + return src_idx; +} + +HOSTDEVICE inline bool is_conj_part(const int64_t dst_idx, + const int64_t* dst_strides, + const int64_t last_axis, + const int64_t last_axis_size) { + int64_t quotient = dst_idx; + int64_t remainder = 0; + + for (int64_t i = 0; i < last_axis + 1; i++) { + remainder = quotient % dst_strides[i]; + quotient = quotient / dst_strides[i]; + + if ((i == last_axis) && (quotient > last_axis_size - 1)) { + return true; + } + + quotient = remainder; + } + + return false; +} + +// FFTFillConjFunctor fill the destination tensor with source tensor and +// conjugate symmetry element of source tensor . +// Use framework::ForRange to iterate destination element with +// supporting different device +template +struct FFTFillConjFunctor { + FFTFillConjFunctor(const C* src_data, + C* dst_data, + const int64_t* src_strides, + const int64_t* dst_strides, + const int64_t* dst_shape, + const bool* is_fft_axis, + const int64_t last_axis, + const int64_t last_axis_size, + const int64_t rank) + : src_data_(src_data), + dst_data_(dst_data), + src_strides_(src_strides), + dst_strides_(dst_strides), + dst_shape_(dst_shape), + is_fft_axis_(is_fft_axis), + last_axis_(last_axis), + last_axis_size_(last_axis_size), + rank_(rank) {} + HOSTDEVICE void operator()(int64_t dst_idx) { + if (is_conj_part(dst_idx, dst_strides_, last_axis_, last_axis_size_)) { + const auto conj_idx = get_src_idx(dst_idx, + dst_strides_, + dst_shape_, + src_strides_, + is_fft_axis_, + true, + rank_); + auto src_value = src_data_[conj_idx]; + auto conj_value = C(src_value.real, -src_value.imag); + dst_data_[dst_idx] = conj_value; + } else { + const auto copy_idx = get_src_idx(dst_idx, + dst_strides_, + dst_shape_, + src_strides_, + is_fft_axis_, + false, + rank_); + dst_data_[dst_idx] = src_data_[copy_idx]; + } + } + + const C* src_data_; + C* dst_data_; + const int64_t* src_strides_; + const int64_t* dst_strides_; + const int64_t* dst_shape_; + const bool* is_fft_axis_; + const int64_t last_axis_; + const int64_t last_axis_size_; + const int64_t rank_; +}; + +template +void FFTFillConj(const DeviceContext& ctx, + const DenseTensor* src, + DenseTensor* dst, + const std::vector& axes) { + std::vector src_strides_v = + phi::vectorize(phi::stride(src->dims())); + std::vector dst_strides_v = + phi::vectorize(phi::stride(dst->dims())); + std::vector dst_shape_v = phi::vectorize(dst->dims()); + const auto src_data = src->data(); + auto dst_data = dst->data(); + const auto last_axis = axes.back(); + const auto last_axis_size = dst->dims().at(last_axis) / 2 + 1; + const int64_t rank = dst->dims().size(); + auto _is_fft_axis = std::make_unique(rank); + for (const auto i : axes) { + _is_fft_axis[i] = true; + } + +#if defined(__NVCC__) || defined(__HIPCC__) + const thrust::device_vector src_strides_g(src_strides_v); + const auto src_strides = thrust::raw_pointer_cast(src_strides_g.data()); + const thrust::device_vector 
dst_strides_g(dst_strides_v); + const auto dst_strides = thrust::raw_pointer_cast(dst_strides_g.data()); + const thrust::device_vector dst_shape_g(dst_shape_v); + const auto dst_shape = thrust::raw_pointer_cast(dst_shape_g.data()); + const thrust::device_vector is_fft_axis_g(_is_fft_axis.get(), + _is_fft_axis.get() + rank); + const auto p_is_fft_axis = thrust::raw_pointer_cast(is_fft_axis_g.data()); +#else + const auto src_strides = src_strides_v.data(); + const auto dst_strides = dst_strides_v.data(); + const auto dst_shape = dst_shape_v.data(); + const auto p_is_fft_axis = _is_fft_axis.get(); +#endif + ForRange for_range(ctx, dst->numel()); + FFTFillConjFunctor fill_conj_functor(src_data, + dst_data, + src_strides, + dst_strides, + dst_shape, + p_is_fft_axis, + last_axis, + last_axis_size, + rank); + for_range(fill_conj_functor); +} + +template +struct FFTFillConjGradFunctor { + T* input_; + const size_t axis_; + const int64_t* strides_; + const size_t double_length_; + + FFTFillConjGradFunctor(T* input, + size_t axis, + const int64_t* strides, + size_t double_length) + : input_(input), + axis_(axis), + strides_(strides), + double_length_(double_length) {} + + HOSTDEVICE void operator()(size_t index) { + size_t offtset = index; // back + size_t index_i; + for (size_t i = 0; i <= axis_; i++) { + index_i = offtset / strides_[i]; + offtset %= strides_[i]; + } + + if ((0 < index_i) && (index_i < double_length_ + 1)) { + input_[index] *= static_cast(2); + } + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/fft_key.h b/paddle/phi/kernels/funcs/fft_key.h new file mode 100644 index 0000000000000..5893cfc6ba019 --- /dev/null +++ b/paddle/phi/kernels/funcs/fft_key.h @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/fft.h" + +namespace phi { +namespace funcs { +namespace detail { + +const int64_t kMaxFFTNdim = 3; +const int64_t kMaxDataNdim = kMaxFFTNdim + 1; + +struct FFTConfigKey { + int signal_ndim_; // 1 <= signal_ndim <= kMaxFFTNdim + // These include additional batch dimension as well. 
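+  // e.g. a batched 2-D C2C transform whose collapsed input is [8, 64, 64]
+  // yields signal_ndim_ = 2 and sizes_ = {8, 64, 64}.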
+  int64_t sizes_[kMaxDataNdim];
+  int64_t input_shape_[kMaxDataNdim];
+  int64_t output_shape_[kMaxDataNdim];
+  FFTTransformType fft_type_;
+  DataType value_type_;
+
+  using shape_t = std::vector<int64_t>;
+  FFTConfigKey() = default;
+
+  FFTConfigKey(const shape_t& in_shape,
+               const shape_t& out_shape,
+               const shape_t& signal_size,
+               FFTTransformType fft_type,
+               DataType value_type) {
+    // Padding bits must be zeroed for hashing
+    memset(this, 0, sizeof(*this));
+    signal_ndim_ = signal_size.size() - 1;
+    fft_type_ = fft_type;
+    value_type_ = value_type;
+    std::copy(signal_size.cbegin(), signal_size.cend(), sizes_);
+    std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_);
+    std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_);
+  }
+};
+
+// Hashing machinery for Key
+// Fowler–Noll–Vo hash function
+// see
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+template <typename Key>
+struct KeyHash {
+  // Key must be a POD because we read out its memory
+  // contents as char* when hashing
+  static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
+
+  size_t operator()(const Key& params) const {
+    auto ptr = reinterpret_cast<const uint8_t*>(&params);
+    uint32_t value = 0x811C9DC5;
+    for (int i = 0; i < static_cast<int>(sizeof(Key)); ++i) {
+      value ^= ptr[i];
+      value *= 0x01000193;
+    }
+    return static_cast<size_t>(value);
+  }
+};
+
+template <typename Key>
+struct KeyEqual {
+  // Key must be a POD because we read out its memory
+  // contents as char* when comparing
+  static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
+
+  bool operator()(const Key& a, const Key& b) const {
+    auto ptr1 = reinterpret_cast<const uint8_t*>(&a);
+    auto ptr2 = reinterpret_cast<const uint8_t*>(&b);
+    return memcmp(ptr1, ptr2, sizeof(Key)) == 0;
+  }
+};
+
+static FFTConfigKey create_fft_configkey(const DenseTensor& input,
+                                         const DenseTensor& output,
+                                         int signal_ndim) {
+  // Create the transform plan (either from cache or locally)
+  DataType input_dtype = input.dtype();
+  const auto value_type =
+      IsComplexType(input_dtype) ? ToRealType(input_dtype) : input_dtype;
+  const auto fft_type = GetFFTTransformType(input.dtype(), output.dtype());
+  // signal sizes
+  std::vector<int64_t> signal_size(signal_ndim + 1);
+
+  signal_size[0] = input.dims()[0];
+  for (int64_t i = 1; i <= signal_ndim; ++i) {
+    auto in_size = input.dims()[i];
+    auto out_size = output.dims()[i];
+    signal_size[i] = std::max(in_size, out_size);
+  }
+  FFTConfigKey key(phi::vectorize(input.dims()),
+                   phi::vectorize(output.dims()),
+                   signal_size,
+                   fft_type,
+                   value_type);
+  return key;
+}
+
+}  // namespace detail
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/hipfft_util.h b/paddle/phi/kernels/funcs/hipfft_util.h
new file mode 100644
index 0000000000000..6583a97f17a1d
--- /dev/null
+++ b/paddle/phi/kernels/funcs/hipfft_util.h
@@ -0,0 +1,184 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/dynload/hipfft.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_key.h" + +namespace phi { +namespace funcs { +namespace detail { + +// An RAII encapsulation of hipFFTHandle +class HIPFFTHandle { + public: + HIPFFTHandle() { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftCreate(&handle_)); + } + + HIPFFTHandle(const HIPFFTHandle& other) = delete; + HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; + + HIPFFTHandle(HIPFFTHandle&& other) = delete; + HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; + + ::hipfftHandle& get() { return handle_; } + const ::hipfftHandle& get() const { return handle_; } + + ~HIPFFTHandle() { phi::dynload::hipfftDestroy(handle_); } + + private: + ::hipfftHandle handle_; +}; + +class FFTConfig { + public: + using plan_size_type = int; + explicit FFTConfig(const FFTConfigKey& key) + : FFTConfig( + std::vector(key.sizes_, key.sizes_ + key.signal_ndim_ + 1), + key.fft_type_, + key.value_type_) {} + FFTConfig(const std::vector& sizes, + FFTTransformType fft_type, + DataType precision) + : fft_type_(fft_type), precision_(precision) { + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + const auto batch_size = static_cast(sizes[0]); + const int signal_ndim = sizes.size() - 1; + + hipfftType exec_type = [&]() { + if (precision == DataType::FLOAT32) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_C2C; + case FFTTransformType::R2C: + return HIPFFT_R2C; + case FFTTransformType::C2R: + return HIPFFT_C2R; + } + } else if (precision == DataType::FLOAT64) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_Z2Z; + case FFTTransformType::R2C: + return HIPFFT_D2Z; + case FFTTransformType::C2R: + return HIPFFT_Z2D; + } + } + PADDLE_THROW(phi::errors::InvalidArgument( + "Only transforms of type float32 and float64 are supported.")); + }(); + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftMakePlanMany(plan(), + signal_ndim, + signal_sizes.data(), + /* inembed */ nullptr, + /* base_istride */ 1, + /* idist */ 1, + /* onembed */ nullptr, + /* base_ostride */ 1, + /* odist */ 1, + exec_type, + batch_size, + &ws_size_)); + } + + const hipfftHandle& plan() const { return plan_.get(); } + FFTTransformType transform_type() const { return fft_type_; } + DataType data_type() const { return precision_; } + size_t workspace_size() const { return ws_size_; } + + private: + HIPFFTHandle plan_; + size_t ws_size_; // workspace size in bytes + FFTTransformType fft_type_; + DataType precision_; +}; + +// NOTE: R2C is forward-only, C2R is backward only +static void exec_plan(const FFTConfig& config, + void* in_data, + void* out_data, + bool forward) { + const hipfftHandle& plan = config.plan(); + + DataType value_type = config.data_type(); + if (value_type == DataType::FLOAT32) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecC2C( + plan, + static_cast(in_data), + static_cast(out_data), + forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftExecR2C(plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::hipfftExecC2R(plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + } + } else if (value_type == DataType::FLOAT64) { + switch (config.transform_type()) { + case FFTTransformType::C2C: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecZ2Z( + plan, + static_cast(in_data), + static_cast(out_data), + forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + return; + } + case FFTTransformType::R2C: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecD2Z( + plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + case FFTTransformType::C2R: { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::hipfftExecZ2D( + plan, + static_cast(in_data), + static_cast(out_data))); + return; + } + } + } + PADDLE_THROW(phi::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/lamb_functors.h b/paddle/phi/kernels/funcs/lamb_functors.h new file mode 100644 index 0000000000000..5abc86bfb777c --- /dev/null +++ b/paddle/phi/kernels/funcs/lamb_functors.h @@ -0,0 +1,463 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include // for sqrt in CPU and CUDA + +#include +#include + +#include "paddle/fluid/memory/buffer.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/algorithm.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/squared_l2_norm.h" +#include "paddle/phi/kernels/funcs/tensor_to_string.h" + +namespace phi { + +namespace scatter = paddle::operators::math::scatter; + +template +struct LambMomentREGUpdateFunctor { + using MT = + typename std::conditional::Type, + T>::type; + + MT weight_decay_; + MT beta1_; + MT beta2_; + MT epsilon_; + + MT beta1_pow_; + MT* beta1_pow_out_; + MT beta2_pow_; + MT* beta2_pow_out_; + const MT* moment1_; + MT* moment1_out_; + const MT* moment2_; + MT* moment2_out_; + const T* grad_; + const MT* param_; + MT* trust_ratio_div_; + const bool* skip_update_; + + LambMomentREGUpdateFunctor(MT weight_decay, + MT beta1, + MT beta2, + MT epsilon, + MT beta1_pow, + MT beta2_pow, + const MT* mom1, + MT* mom1_out, + const MT* mom2, + MT* mom2_out, + const T* grad, + const MT* param, + MT* trust_ratio_div, + const bool* skip_update) + : weight_decay_(weight_decay), + beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + grad_(grad), + param_(param), + trust_ratio_div_(trust_ratio_div), + skip_update_(skip_update) {} + + inline HOSTDEVICE void operator()(size_t i) const { + if (skip_update_ && *skip_update_) return; + + MT g = static_cast(grad_[i]); + MT mom1 = moment1_[i]; + MT mom2 = moment2_[i]; + MT beta1_pow = beta1_pow_; + MT beta2_pow = beta2_pow_; + MT p = param_[i]; + + mom1 = beta1_ * mom1 + (static_cast(1) - beta1_) * g; + mom2 = beta2_ * mom2 + (static_cast(1) - beta2_) * g * g; + + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + + MT mom1_unbiased = mom1 / (static_cast(1) - beta1_pow); + MT mom2_unbiased = mom2 / (static_cast(1) - beta2_pow); + trust_ratio_div_[i] = + mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + + weight_decay_ * p; + } +}; + +template +struct LambMomentMENUpdateFunctor { + using MT = + typename std::conditional::Type, + T>::type; + + MT weight_decay_; + MT beta1_; + MT beta2_; + MT epsilon_; + + const MT* beta1_pow_; + const MT* beta2_pow_; + const MT* moment1_; + MT* moment1_out_; + const MT* moment2_; + MT* moment2_out_; + const T* grad_; + const MT* param_; + MT* trust_ratio_div_; + const bool* skip_update_; + + LambMomentMENUpdateFunctor(MT weight_decay, + MT beta1, + MT beta2, + MT epsilon, + const MT* beta1_pow, + const MT* beta2_pow, + const MT* mom1, + MT* mom1_out, + const MT* mom2, + MT* mom2_out, + const T* grad, + const MT* param, + MT* trust_ratio_div, + const bool* skip_update) + : weight_decay_(weight_decay), + beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + grad_(grad), + param_(param), + trust_ratio_div_(trust_ratio_div), + skip_update_(skip_update) {} + + inline HOSTDEVICE void operator()(size_t i) const { + if (skip_update_ && *skip_update_) return; + MT g = static_cast(grad_[i]); + MT mom1 = moment1_[i]; + MT mom2 = moment2_[i]; + 
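+    // LAMB moment update: m <- beta1 * m + (1 - beta1) * g and
+    // v <- beta2 * v + (1 - beta2) * g * g; the bias-corrected ratio
+    // m_hat / (sqrt(v_hat) + epsilon) + weight_decay * p is stored in
+    // trust_ratio_div_ and consumed later by the parameter update functor.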
MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + MT p = param_[i]; + + mom1 = beta1_ * mom1 + (static_cast(1) - beta1_) * g; + mom2 = beta2_ * mom2 + (static_cast(1) - beta2_) * g * g; + + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + + MT mom1_unbiased = mom1 / (static_cast(1) - beta1_pow); + MT mom2_unbiased = mom2 / (static_cast(1) - beta2_pow); + trust_ratio_div_[i] = + mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + + weight_decay_ * p; + } +}; + +template +struct SparseLambMomentREGUpdateFunctor { + T weight_decay_; + T beta1_; + T beta2_; + T epsilon_; + + T beta1_pow_; + T beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* grad_; + const T* param_; + T* trust_ratio_div_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t row_count_; + + const bool* skip_update_; + + SparseLambMomentREGUpdateFunctor(T weight_decay, + T beta1, + T beta2, + T epsilon, + T beta1_pow, + T beta2_pow, + const T* mom1, + T* mom1_out, + const T* mom2, + T* mom2_out, + const T* grad, + const T* param, + T* trust_ratio_div, + const int64_t* rows, + int64_t row_numel, + int64_t row_count, + const bool* skip_update) + : weight_decay_(weight_decay), + beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + grad_(grad), + param_(param), + trust_ratio_div_(trust_ratio_div), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count), + skip_update_(skip_update) {} + + inline HOSTDEVICE void update(size_t i, T g) const { + // The following code is same as dense + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T beta1_pow = beta1_pow_; + T beta2_pow = beta2_pow_; + T p = param_[i]; + + mom1 = beta1_ * mom1 + (static_cast(1) - beta1_) * g; + mom2 = beta2_ * mom2 + (static_cast(1) - beta2_) * g * g; + + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + + T mom1_unbiased = mom1 / (static_cast(1) - beta1_pow); + T mom2_unbiased = mom2 / (static_cast(1) - beta2_pow); + trust_ratio_div_[i] = + mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + + weight_decay_ * p; + } + + inline HOSTDEVICE void operator()(size_t i) const { + if (skip_update_ && *skip_update_) return; + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); + T g = row_idx >= 0 ? 
grad_[row_idx * row_numel_ + i % row_numel_] + : static_cast(0); + update(i, g); + } +}; + +template +struct SparseLambMomentMENUpdateFunctor { + T weight_decay_; + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* grad_; + const T* param_; + T* trust_ratio_div_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t row_count_; + + const bool* skip_update_; + + SparseLambMomentMENUpdateFunctor(T weight_decay, + T beta1, + T beta2, + T epsilon, + const T* beta1_pow, + const T* beta2_pow, + const T* mom1, + T* mom1_out, + const T* mom2, + T* mom2_out, + const T* grad, + const T* param, + T* trust_ratio_div, + const int64_t* rows, + int64_t row_numel, + int64_t row_count, + const bool* skip_update) + : weight_decay_(weight_decay), + beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + grad_(grad), + param_(param), + trust_ratio_div_(trust_ratio_div), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count), + skip_update_(skip_update) {} + + inline HOSTDEVICE void update(size_t i, T g) const { + // The following code is same as dense + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[i]; + + mom1 = beta1_ * mom1 + (static_cast(1) - beta1_) * g; + mom2 = beta2_ * mom2 + (static_cast(1) - beta2_) * g * g; + + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + + T mom1_unbiased = mom1 / (static_cast(1) - beta1_pow); + T mom2_unbiased = mom2 / (static_cast(1) - beta2_pow); + trust_ratio_div_[i] = + mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + + weight_decay_ * p; + } + + inline HOSTDEVICE void operator()(size_t i) const { + if (skip_update_ && *skip_update_) return; + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); + T g = row_idx >= 0 ? 
grad_[row_idx * row_numel_ + i % row_numel_] + : static_cast(0); + update(i, g); + } +}; + +template +struct LambBetaPowUpdateFunctor { + void SetBetaPows(const MT* beta1pow, + const MT* beta2pow, + MT* beta1pow_out, + MT* beta2pow_out, + MT beta1, + MT beta2) { + beta1pow_ = beta1pow; + beta2pow_ = beta2pow; + beta1pow_out_ = beta1pow_out; + beta2pow_out_ = beta2pow_out; + beta1_ = beta1; + beta2_ = beta2; + } + + HOSTDEVICE void UpdateBetaPow(size_t i) const { + if (i == 0) { + beta1pow_out_[0] = beta1pow_[0] * beta1_; + beta2pow_out_[0] = beta2pow_[0] * beta2_; + } + } + + private: + const MT* beta1pow_; + const MT* beta2pow_; + MT* beta1pow_out_; + MT* beta2pow_out_; + MT beta1_; + MT beta2_; +}; + +template +struct LambBetaPowUpdateFunctor { + void SetBetaPows(const MT* beta1pow, + const MT* beta2pow, + MT* beta1pow_out, + MT* beta2pow_out, + MT beta1, + MT beta2) {} + HOSTDEVICE void UpdateBetaPow(size_t) const {} +}; + +template +struct LambParamUpateFunctor + : public LambBetaPowUpdateFunctor { + const MT* lr_; + const T* param_; + const MT* master_param_; + const MT* param_norm_; + const MT* trust_ratio_div_; + const MT* trust_ratio_div_norm_; + T* param_out_; + MT* master_param_out_; + + const bool* skip_update_; + + LambParamUpateFunctor(const MT* lr, + const T* param, + const MT* master_param, + const MT* param_norm, + const MT* trust_ratio_div, + const MT* trust_ratio_div_norm, + T* param_out, + MT* master_param_out, + const bool* skip_update) + : lr_(lr), + param_(param), + master_param_(master_param), + param_norm_(param_norm), + trust_ratio_div_(trust_ratio_div), + trust_ratio_div_norm_(trust_ratio_div_norm), + param_out_(param_out), + master_param_out_(master_param_out), + skip_update_(skip_update) {} + + inline HOSTDEVICE void operator()(size_t i) const { + if (skip_update_ && *skip_update_) return; + MT lr = *lr_; + MT pn = Eigen::numext::sqrt(*param_norm_); + MT tn = Eigen::numext::sqrt(*trust_ratio_div_norm_); + + MT r = (pn > static_cast(0) && tn > static_cast(0)) + ? pn / tn + : static_cast(1); + lr *= r; + MT p = IsMultiPrecision ? master_param_[i] : static_cast(param_[i]); + MT param_out = p - lr * trust_ratio_div_[i]; + param_out_[i] = static_cast(param_out); + if (IsMultiPrecision) { + master_param_out_[i] = param_out; + } + this->UpdateBetaPow(i); + } +}; + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/mkl_fft_utils.h b/paddle/phi/kernels/funcs/mkl_fft_utils.h new file mode 100644 index 0000000000000..dbc0678ab7ae5 --- /dev/null +++ b/paddle/phi/kernels/funcs/mkl_fft_utils.h @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
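+//
+// Helpers for the oneMKL (DFTI) based CPU FFT kernels: MKL_DFTI_CHECK wraps
+// status checking, DftiDescriptor is an RAII owner for a DFTI descriptor
+// handle, and plan_mkl_fft builds a committed descriptor for a batched
+// C2C/R2C/C2R transform with the requested strides and normalization.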
+ +#pragma once +#include +#include +#include "paddle/phi/backends/dynload/mklrt.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/kernels/funcs/fft.h" + +namespace phi { +namespace funcs { +namespace detail { + +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW( \ + phi::errors::External(phi::dynload::DftiErrorMessage(status))); \ + } while (0); + +struct DftiDescriptorDeleter { + void operator()(DFTI_DESCRIPTOR_HANDLE handle) { + if (handle != nullptr) { + MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); + } + } +}; + +// A RAII wrapper for MKL_DESCRIPTOR* +class DftiDescriptor { + public: + void init(DFTI_CONFIG_VALUE precision, + DFTI_CONFIG_VALUE signal_type, + MKL_LONG signal_ndim, + MKL_LONG* sizes) { + PADDLE_ENFORCE_EQ(desc_.get(), + nullptr, + phi::errors::AlreadyExists( + "DftiDescriptor has already been initialized.")); + + DFTI_DESCRIPTOR* raw_desc; + MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( + &raw_desc, precision, signal_type, signal_ndim, sizes)); + desc_.reset(raw_desc); + } + + DFTI_DESCRIPTOR* get() const { + DFTI_DESCRIPTOR* raw_desc = desc_.get(); + PADDLE_ENFORCE_NOT_NULL(raw_desc, + phi::errors::PreconditionNotMet( + "DFTI DESCRIPTOR has not been initialized.")); + return raw_desc; + } + + private: + std::unique_ptr desc_; +}; + +static DftiDescriptor plan_mkl_fft(const DataType in_dtype, + const DataType out_dtype, + const phi::DDim& in_strides, + const phi::DDim& out_strides, + const std::vector& signal_sizes, + FFTNormMode normalization, + bool forward) { + const DFTI_CONFIG_VALUE precision = [&] { + switch (in_dtype) { + case DataType::FLOAT32: + return DFTI_SINGLE; + case DataType::COMPLEX64: + return DFTI_SINGLE; + case DataType::FLOAT64: + return DFTI_DOUBLE; + case DataType::COMPLEX128: + return DFTI_DOUBLE; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Invalid input datatype (%s), input data type should be FP32, " + "FP64, COMPLEX64 or COMPLEX128.", + in_dtype)); + } + }(); + + // C2C, R2C, C2R + const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype); + const DFTI_CONFIG_VALUE domain = + (fft_type == FFTTransformType::C2C) ? 
DFTI_COMPLEX : DFTI_REAL; + + DftiDescriptor descriptor; + std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); + const MKL_LONG signal_ndim = fft_sizes.size() - 1; + descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); + + // placement inplace or not inplace + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + + // number of transformations + const MKL_LONG batch_size = fft_sizes[0]; + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); + + // input & output distance + const MKL_LONG idist = in_strides[0]; + const MKL_LONG odist = out_strides[0]; + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); + + // input & output stride + std::vector mkl_in_stride(1 + signal_ndim, 0); + std::vector mkl_out_stride(1 + signal_ndim, 0); + for (MKL_LONG i = 1; i <= signal_ndim; i++) { + mkl_in_stride[i] = in_strides[i]; + mkl_out_stride[i] = out_strides[i]; + } + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); + + // conjugate even storage + if (!(fft_type == FFTTransformType::C2C)) { + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( + descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); + } + + MKL_LONG signal_numel = std::accumulate(fft_sizes.cbegin() + 1, + fft_sizes.cend(), + 1UL, + std::multiplies()); + if (normalization != FFTNormMode::none) { + const double scale = + ((normalization == FFTNormMode::by_sqrt_n) + ? 1.0 / std::sqrt(static_cast(signal_numel)) + : 1.0 / static_cast(signal_numel)); + const auto scale_direction = [&]() { + if (fft_type == FFTTransformType::R2C || + (fft_type == FFTTransformType::C2C && forward)) { + return DFTI_FORWARD_SCALE; + } else { + // (fft_type == FFTTransformType::C2R || + // (fft_type == FFTTransformType::C2C && !forward)) + return DFTI_BACKWARD_SCALE; + } + }(); + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); + } + + // commit the descriptor + MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); + return descriptor; +} + +} // namespace detail +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h b/paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h new file mode 100644 index 0000000000000..96333132508c4 --- /dev/null +++ b/paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h @@ -0,0 +1,301 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +namespace funcs { + +using user_function = std::function(const float*)>; +using memory = dnnl::memory; +using Place = phi::Place; + +template +class MKLDNNHandlerNoCachingT { + public: + MKLDNNHandlerNoCachingT(dnnl::engine engine, Place cpu_place) + : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { + phi::OneDNNContext::tls().log_lib_version(); + } + + std::shared_ptr AcquireForwardPrimitive() { + return std::make_shared(*fwd_pd_); + } + + std::shared_ptr AcquireBackwardPrimitive() { + return std::make_shared(*bwd_pd_); + } + + std::shared_ptr AcquireBackwardWeightsPrimitive() { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + phi::errors::Unavailable("BWD_PD should be set when " + "getting BWD prim .")); + return std::make_shared(*bwd_w_pd_); + } + + std::shared_ptr AcquireSrcMemory(const DenseTensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + fwd_pd_->src_desc(), paddle::platform::to_void_cast(input_data)); + } + + template + std::shared_ptr AcquireDstMemory(DenseTensor* output) { + T_out* ptr = + output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); + } + + template + std::shared_ptr AcquireDstMemory(void) { + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); + } + + template + std::shared_ptr AcquireDstMemory(const DenseTensor* output) { + const T_out* output_data = output->data(); + return this->AcquireMemoryFromPrimitive( + bwd_pd_->dst_desc(), + paddle::platform::to_void_cast(output_data)); + } + + std::shared_ptr AcquireDiffDstMemory( + const DenseTensor* diffdst) { + const T* ptr = diffdst->data(); + return this->AcquireMemoryFromPrimitive( + bwd_pd_->diff_dst_desc(), paddle::platform::to_void_cast(ptr)); + } + + std::shared_ptr AcquireDiffSrcMemory(DenseTensor* diffsrc) { + T* ptr = + diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); + } + + // Buffer of given Tensor is used for oneDNN computation + std::shared_ptr AcquireDiffWeightsMemory( + DenseTensor* diff_weights) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + phi::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + T* ptr = diff_weights->mutable_data( + place_, bwd_w_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), + ptr); + } + + // Buffer is allocated by oneDNN to store computation results + std::shared_ptr AcquireDiffWeightsMemory(void) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + phi::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); + } + + protected: + // If your primitive descriptor requires attributes, pass them as a + // first argument and paramters to descriptor constructor in the following + // arguments. Otherwise, all arguments will be forwarded to descriptor + // constructor, including the first one. + template + void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... 
args) { + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + } + + // Using sfinae to specialise variadic function. Workaround for not having + // if constexpr in C++ 11. + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(args)...); + fwd_pd_ = std::make_shared( + fwd_desc, first, engine_); + } + + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(first), + std::forward(args)...); + fwd_pd_ = + std::make_shared(fwd_desc, engine_); + } + + template + void AcquireBackwardPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + phi::errors::Unavailable("Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + template + void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + phi::errors::Unavailable("Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = + typename TBackward_params::desc(std::forward(args)...); + bwd_w_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + dnnl::memory::desc md, void* ptr) { + return std::make_shared(md, engine_, ptr); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + dnnl::memory::desc md) { + return std::make_shared(md, engine_); + } + + void AcquireReorder(const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p) { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = phi::OneDNNContext::tls().get_stream(); + + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 2, + paddle::platform::EventRole::kUniqueOp); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } + + template + std::shared_ptr AcquireMemoryWithReorder( + const dnnl::memory::desc& user_md, + const dnnl::memory::desc& target_md, + void* ptr, + bool is_persistent = false, + std::function(const F*)> custom_reorder_func = {}) { + std::shared_ptr target_memory_p; + if (custom_reorder_func) { + auto reordered_data = + custom_reorder_func(reinterpret_cast(ptr)); + ptr = reinterpret_cast(reordered_data.get()); + } + auto user_memory_p = std::make_shared(user_md, engine_, ptr); + if (user_md != target_md) { + target_memory_p = std::make_shared(target_md, engine_); + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = phi::OneDNNContext::tls().get_stream(); + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 2, + paddle::platform::EventRole::kUniqueOp); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } else { + target_memory_p = user_memory_p; + } + return target_memory_p; + } + + dnnl::engine engine_; + Place place_; + std::shared_ptr fwd_pd_; + std::shared_ptr bwd_pd_; + std::shared_ptr bwd_w_pd_; 
+}; + +template +class ActivationMKLDNNHandler + : public MKLDNNHandlerNoCachingT { + public: + ActivationMKLDNNHandler(dnnl::algorithm algorithm, + float alpha, + float beta, + const dnnl::engine engine, + Place cpu_place, + const DenseTensor* x) + : MKLDNNHandlerNoCachingT(engine, cpu_place) { + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, + algorithm, + x->mem_desc(), + alpha, + beta); + } + + ActivationMKLDNNHandler(dnnl::algorithm algorithm, + float alpha, + float beta, + const dnnl::engine engine, + Place cpu_place, + const DenseTensor* x, + const DenseTensor* dout) + : MKLDNNHandlerNoCachingT(engine, cpu_place) { + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, + algorithm, + x->mem_desc(), + alpha, + beta); + this->AcquireBackwardPrimitiveDescriptor( + algorithm, dout->mem_desc(), x->mem_desc(), alpha, beta); + } + + std::shared_ptr AcquireBackwardSrcMemory( + const DenseTensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + this->bwd_pd_->src_desc(), + paddle::platform::to_void_cast(input_data)); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index f27174d581818..6c293b2394443 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -79,6 +79,7 @@ __global__ void ScatterKernelV2(const T* input, const int* index_groups, const int non_zero_num, const int kernel_size, + const int max_voxel, const int channels, const int buffer_counts, T* out) { @@ -96,10 +97,11 @@ __global__ void ScatterKernelV2(const T* input, &sums); for (int it = 0; it < buffer_counts; it++) { int len = index_counts[indices_i + it * non_zero_num]; - const int group_offset = it * kernel_size * non_zero_num; + const int group_offset = it * max_voxel * kernel_size * non_zero_num; for (int j = 0; j < len; j++) { const int out_feature_i = - index_groups[indices_i * kernel_size + j + group_offset]; + index_groups[indices_i * max_voxel * kernel_size + j + + group_offset]; LoadT vec_in; phi::Load( input + out_feature_i * channels + channels_i * VecSize, &vec_in); @@ -121,6 +123,7 @@ void ScatterV2(const GPUContext& dev_ctx, const int* index_groups, const int non_zero_num, const int kernel_size, + const int max_voxel, const int channels, const int buffer_counts, T* output) { @@ -136,6 +139,7 @@ void ScatterV2(const GPUContext& dev_ctx, index_groups, non_zero_num, kernel_size, + max_voxel, channels, buffer_counts, output); @@ -150,6 +154,7 @@ void ScatterV2(const GPUContext& dev_ctx, index_groups, non_zero_num, kernel_size, + max_voxel, channels, buffer_counts, output); diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 0458f0d83ed1a..9ced02fcb690c 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -174,7 +174,7 @@ class CuSparseSpMatDescriptor { explicit CuSparseSpMatDescriptor(const phi::SparseCsrTensor& x, const phi::GPUContext& dev_ctx) : dev_ctx_(dev_ctx) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_crows().dtype(), "Csr CuSparseSpMatDescriptor", ([&] { CreateCsrDescriptor(x, dev_ctx_, &descriptor_); })); @@ -184,7 +184,7 @@ class CuSparseSpMatDescriptor { explicit CuSparseSpMatDescriptor(const phi::SparseCooTensor& x, const phi::GPUContext& dev_ctx) : dev_ctx_(dev_ctx) { - 
PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Coo CuSparseSpMatDescriptor", ([&] { CreateCooDescriptor(x, dev_ctx_, &descriptor_); })); diff --git a/paddle/fluid/operators/tensor_to_string.h b/paddle/phi/kernels/funcs/tensor_to_string.h similarity index 66% rename from paddle/fluid/operators/tensor_to_string.h rename to paddle/phi/kernels/funcs/tensor_to_string.h index ef8a041fc5adc..2f1fb574930f3 100644 --- a/paddle/fluid/operators/tensor_to_string.h +++ b/paddle/phi/kernels/funcs/tensor_to_string.h @@ -16,13 +16,14 @@ #include -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/string/string_helper.h" -namespace paddle { -namespace operators { +namespace phi { +namespace funcs { template static const std::vector &ToVector(const std::vector &vec) { @@ -30,22 +31,20 @@ static const std::vector &ToVector(const std::vector &vec) { } template -static std::vector ToVector(const T *x, - size_t n, - const platform::Place &place) { +static std::vector ToVector(const T *x, size_t n, const phi::Place &place) { #ifdef __NVCC__ - if (platform::is_gpu_place(place)) { + if (paddle::platform::is_gpu_place(place)) { using CopyT = typename std:: conditional::value, uint8_t, T>::type; std::vector cpu_x(n); auto *dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - memory::Copy(platform::CPUPlace(), - cpu_x.data(), - place, - x, - n * sizeof(T), - dev_ctx->stream()); + phi::DeviceContextPool::Instance().Get(place)); + paddle::memory::Copy(phi::CPUPlace(), + cpu_x.data(), + place, + x, + n * sizeof(T), + dev_ctx->stream()); dev_ctx->Wait(); return std::vector(cpu_x.data(), cpu_x.data() + n); } @@ -54,7 +53,7 @@ static std::vector ToVector(const T *x, } template -static std::vector ToVector(const framework::Tensor &src) { +static std::vector ToVector(const DenseTensor &src) { if (!src.IsInitialized()) { return {}; } @@ -64,8 +63,8 @@ static std::vector ToVector(const framework::Tensor &src) { template static std::string FlattenToString(Args &&...args) { const auto &vec = ToVector(std::forward(args)...); - return "[" + string::join_strings(vec, ',') + "]"; + return "[" + paddle::string::join_strings(vec, ',') + "]"; } -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/generate_proposals_v2_kernel.h b/paddle/phi/kernels/generate_proposals_v2_kernel.h new file mode 100644 index 0000000000000..c2fc2677039f9 --- /dev/null +++ b/paddle/phi/kernels/generate_proposals_v2_kernel.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
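The ToVector / FlattenToString helpers relocated above are small debugging utilities that render a tensor's host copy as a comma separated list. A CPU-only sketch of the same rendering using only the standard library (names here are illustrative, not the phi API):

#include <cstddef>
#include <sstream>
#include <string>
#include <vector>

// Joins values with ',' inside brackets, the way FlattenToString prints a
// tensor that has already been copied back to host memory.
template <typename T>
std::string FlattenToStringSketch(const std::vector<T>& vec) {
  std::ostringstream os;
  os << "[";
  for (std::size_t i = 0; i < vec.size(); ++i) {
    if (i != 0) os << ",";
    os << vec[i];
  }
  os << "]";
  return os.str();
}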
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GenerateProposalsV2Kernel(const Context& ctx, + const DenseTensor& scores, + const DenseTensor& bbox_deltas, + const DenseTensor& im_shape, + const DenseTensor& anchors, + const DenseTensor& variances, + int pre_nms_top_n, + int post_nms_top_n, + float nms_thresh, + float min_size, + float eta, + bool pixel_offset, + DenseTensor* rpn_rois, + DenseTensor* rpn_roi_probs, + DenseTensor* rpn_rois_num); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu index baa2616267d53..b2cb0f2ad7319 100644 --- a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/kernels/affine_grid_grad_kernel.h" + #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -22,6 +23,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/affine_grid_utils.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/affine_grid_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_kernel.cu index ad5072f4bacd1..4e5c326be7b5e 100644 --- a/paddle/phi/kernels/gpu/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_kernel.cu @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/kernels/affine_grid_kernel.h" + #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -22,6 +23,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/affine_grid_utils.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 8731946ba2d42..5c6fd04c15e68 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -691,6 +691,9 @@ void BatchNormKernel(const Context &ctx, auto handle = ctx.cudnn_handle(); + const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 10240; + const size_t CUDNN_SPATIAL_THRESHOLD = 880801; + // Now, depending on whether we are running test or not, we have two paths. // It is training mode when it's not reference AND not using pre-trained // model. 
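The two thresholds introduced above drive the inference-path dispatch added in the next hunk: 2-D inputs are compared against the per-activation threshold and 3-D inputs against the spatial one, and only sufficiently large batches take the native kernel instead of cuDNN. A condensed sketch of that predicate, with names simplified:

// Mirrors the use_native_kernel condition in the hunk below.
inline bool UseNativeBatchNormInference(int x_rank, long long N) {
  const long long kPerActivationThreshold = 10240;   // CUDNN_PER_ACTIVATION_THRESHOLD
  const long long kSpatialThreshold = 880801;        // CUDNN_SPATIAL_THRESHOLD
  return (x_rank == 2 && N >= kPerActivationThreshold) ||
         (x_rank == 3 && N >= kSpatialThreshold);
}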
@@ -793,23 +796,58 @@ void BatchNormKernel(const Context &ctx, // est_var->template data>())), // epsilon)); #else - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnBatchNormalizationForwardInference( - handle, - // Note: PERSISTENT not implemented for inference - CUDNN_BATCHNORM_SPATIAL, - CudnnDataType::kOne(), - CudnnDataType::kZero(), - data_desc_, - transformed_x.template data(), - data_desc_, - ctx.template Alloc(&transformed_y), - bn_param_desc_, - scale.template data>(), - bias.template data>(), - est_mean->template data>(), - est_var->template data>(), - epsilon)); + const bool use_native_kernel = + ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || + (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); + if (use_native_kernel) { + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference + <<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + transformed_y.template data()); + } else { + BNForwardInference + <<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + transformed_y.template data()); + } + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardInference( + handle, + // Note: PERSISTENT not implemented for inference + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + est_mean->template data>(), + est_var->template data>(), + epsilon)); + } #endif } else { // if MomentumTensor is set, use MomentumTensor value, momentum @@ -909,8 +947,6 @@ void BatchNormKernel(const Context &ctx, // BatchNormParamType>(ctx.GetPlace())))); #else // const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; - const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 10240; - const size_t CUDNN_SPATIAL_THRESHOLD = 880801; const bool use_native_kernel = ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); @@ -1190,7 +1226,16 @@ PD_REGISTER_KERNEL(batch_norm, ALL_LAYOUT, phi::BatchNormKernel, float, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); +} #else PD_REGISTER_KERNEL(batch_norm, GPU, @@ -1200,6 +1245,10 @@ PD_REGISTER_KERNEL(batch_norm, double, phi::dtype::float16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); 
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); diff --git a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu new file mode 100644 index 0000000000000..eb92a4488e502 --- /dev/null +++ b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu @@ -0,0 +1,598 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_HIP +#include +#include + +#include +typedef hiprandState curandState; +namespace cub = hipcub; +#else +#include +#include + +#include +#endif + +#include +#include + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +#define CUDA_KERNEL_LOOP(i, n) \ + for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x, \ + step = blockDim.x * gridDim.x; \ + i < (n); \ + i += step) + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +inline int32_t NumBlocks(const int32_t n) { + return std::min((n + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void RandomSampleClassCenter(const int64_t n, + int64_t seed, + int64_t increment, + const int64_t max_val, + T* buffer) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + curandState localState; + size_t local_seed = + (static_cast(seed) + 0x9E3779B9U + + (static_cast(id) << 6U) + (static_cast(id) >> 2U)); +#ifdef PADDLE_WITH_HIP + hiprand_init(local_seed, id, increment, &localState); + CUDA_KERNEL_LOOP(i, n) { + buffer[i] = static_cast(hiprand(&localState) % max_val); + } +#else + curand_init(local_seed, id, increment, &localState); + CUDA_KERNEL_LOOP(i, n) { + buffer[i] = static_cast(curand(&localState) % max_val); + } +#endif +} + +template +__global__ void Range(const int64_t n, T* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = static_cast(i); } +} + +template +__global__ void MarkPositiveClassCenter(const int64_t n, + const int64_t rank, + const T* class_interval_ptr, + const int num_classes, + const T* labels, + T* out) { + CUDA_KERNEL_LOOP(i, n) { + T label = labels[i] - class_interval_ptr[rank]; + if (label >= 0 && label < num_classes) { + out[label] = label - num_classes; + } + } +} + +template +__device__ void FindIntervalIndex(const T* class_interval_ptr, + const int64_t nranks, + const T value, + int64_t* find_index) { + int64_t start = 0; + int64_t end = nranks; + int64_t mid = ((end - start) >> 1) + start + 1; + while (start < end) { + if (class_interval_ptr[mid] == value) break; + if (class_interval_ptr[mid] > value) + end = mid - 1; + 
else + start = mid; + mid = ((end - start) >> 1) + start + 1; + } + *find_index = min(mid, end); +} + +template +__global__ void GetClassCenterBound(const int64_t n, + const int64_t nranks, + const T* class_interval_ptr, + const T* key_ptr, + const T* value_ptr, + T* bound_index, + T* bound_value) { + CUDA_KERNEL_LOOP(i, n) { + if (i != 0) { + int64_t cur_index, pre_index; + FindIntervalIndex(class_interval_ptr, nranks, key_ptr[i], &cur_index); + FindIntervalIndex(class_interval_ptr, nranks, key_ptr[i - 1], &pre_index); + if (cur_index > pre_index) { + assert(cur_index < nranks); +#pragma unroll + for (int32_t j = pre_index + 1; j <= cur_index; ++j) { + bound_index[j] = static_cast(i); + bound_value[j] = value_ptr[i]; + } + } + } + } + CUDA_KERNEL_LOOP(i, nranks + 1) { + int64_t first_index, last_index; + FindIntervalIndex(class_interval_ptr, nranks, key_ptr[0], &first_index); + FindIntervalIndex(class_interval_ptr, nranks, key_ptr[n - 1], &last_index); + if (i <= first_index) { + bound_index[i] = 0; + bound_value[i] = value_ptr[0]; + } else if (i > last_index) { + bound_index[i] = n; + bound_value[i] = value_ptr[n - 1] + 1; + } + } +} + +template +__global__ void GetRemappedLabel(const int64_t n, + const int64_t nranks, + const T* sampled_class_interval_ptr, + const T* bound_index, + const T* bound_value, + const T* label_map_key, + T* label_map_value, + T* mapped_label) { + CUDA_KERNEL_LOOP(i, n) { +#pragma unroll + for (int64_t j = 0; j < nranks; j++) { + if (i >= bound_index[j] && i < bound_index[j + 1]) { + label_map_value[i] = + label_map_value[i] - bound_value[j] + sampled_class_interval_ptr[j]; + } + } + mapped_label[label_map_key[i]] = label_map_value[i]; + } +} + +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(T) * Size) AlignedVector { + T val[Size]; +}; + +template +inline int VectorizedSize(const T* pointer) { + uint64_t address = reinterpret_cast(pointer); + constexpr int vec4 = std::alignment_of>::value; // NOLINT + if (address % vec4 == 0) { + return 4; + } + return 1; +} + +#undef CUDA_KERNEL_LOOP + +template +class NotEqualToPreviousAdjacentIterator { + public: + using self_type = NotEqualToPreviousAdjacentIterator; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T*; + using reference = T; + using iterator_category = std::input_iterator_tag; + + public: + __host__ __device__ __forceinline__ + NotEqualToPreviousAdjacentIterator(const T* arr, int64_t offset) + : arr_(arr), offset_(offset) {} + + __host__ __device__ __forceinline__ reference operator*() const { + return offset_ == 0 ? 0 : (arr_[offset_] == arr_[offset_ - 1] ? 
0 : 1); + } + + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const { + self_type ret(arr_, offset_ + n); + return ret; + } + + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const { + self_type ret(arr_, offset_ - n); + return ret; + } + + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const { + return *(*this + n); + } + + private: + const T* arr_; + int64_t offset_; +}; + +template +struct ActualNumSampledFunctor { + __host__ __device__ __forceinline__ T operator()(const T& a, + const T& b) const { + return max(num_samples, (b - a)); + } + T num_samples; + explicit ActualNumSampledFunctor(const T num) : num_samples(num) {} +}; + +template +class MemoryBuffer { + public: + MemoryBuffer(const int num_buffer_ele, + const int num_temp_ele, + const int nranks, + const Context& dev_ctx) { + offset1 = 0; + offset2 = offset1 + num_buffer_ele; + offset3 = offset2 + num_buffer_ele; + offset4 = offset3 + num_buffer_ele; + offset5 = offset4 + num_buffer_ele; + offset6 = offset5 + (nranks + 1); + offset7 = offset6 + (nranks + 1); + offset8 = offset7 + (nranks + 1); + offset9 = offset8 + num_temp_ele; + + buffer.Resize({4 * num_buffer_ele + 3 * (nranks + 1) + num_temp_ele}); + buffer_ptr = dev_ctx.template Alloc(&buffer); + } + + T* cub_sort_keys_ptr() { return buffer_ptr + offset1; } + T* cub_sort_keys_out_ptr() { return buffer_ptr + offset2; } + T* cub_sort_values_ptr() { return buffer_ptr + offset3; } + T* cub_sort_values_out_ptr() { return buffer_ptr + offset4; } + T* bound_index_ptr() { return buffer_ptr + offset5; } + T* bound_value_ptr() { return buffer_ptr + offset6; } + T* class_interval_ptr() { return buffer_ptr + offset7; } + void* cub_temp_storage_ptr() { + return reinterpret_cast(buffer_ptr + offset8); + } + + private: + DenseTensor buffer; + T* buffer_ptr; + int offset1; + int offset2; + int offset3; + int offset4; + int offset5; + int offset6; + int offset7; + int offset8; + int offset9; +}; + +template +void ClassCenterSampleKernel(const Context& dev_ctx, + const DenseTensor& label, + int num_classes, + int num_samples, + int ring_id, + int rank, + int nranks, + bool fix_seed, + int seed, + DenseTensor* remapped_label, + DenseTensor* sampled_local_class_center) { + PADDLE_ENFORCE_GT(num_classes, + 0, + errors::InvalidArgument( + "The value 'num_classes' for Op(class_center_sample) " + "must be greater than 0, " + "but the value given is %d.", + num_classes)); + + PADDLE_ENFORCE_GT(num_samples, + 0, + errors::InvalidArgument( + "The value 'num_samples' for Op(class_center_sample) " + "must be greater than 0, " + "but the value given is %d.", + num_samples)); + + PADDLE_ENFORCE_LE(num_samples, + num_classes, + errors::InvalidArgument( + "The value 'num_samples' for Op(class_center_sample) " + "must be less than or equal to %d, " + "but the value given is %d.", + num_classes, + num_samples)); + + auto place = dev_ctx.GetPlace(); + + int batch_size = label.numel(); + // Algorithm: + // We first randomly generate a value in [0, num_classes) on each position + // in a array(shape[num_classes]). Then, we mark the element as negative + // value in the array according input label. Now, we can sort the array + // by ascending to ensure that the positive class center always in the + // front of the sorted array. So, we can get the sampled class center + // index by sorted keys. Finally, we can get the rempped label by remap + // the input label according sampled class center. 
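The comment above is the heart of the kernel. A CPU-only illustration of the same sampling trick, kept separate from the CUDA code; the real kernel additionally enlarges num_samples so that every positive class center is retained, see step 11 below:

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <random>
#include <vector>

// Positive class centers get negative keys, so an ascending sort moves them
// to the front; the first num_samples sorted class ids are the sample.
std::vector<long long> SampleClassCentersSketch(
    const std::vector<long long>& labels,
    long long num_classes,
    long long num_samples,
    unsigned seed) {
  std::mt19937 rng(seed);
  std::uniform_int_distribution<long long> dist(0, num_classes - 1);
  std::vector<long long> keys(num_classes);
  for (auto& k : keys) k = dist(rng);                                // random key per class
  for (long long label : labels) keys[label] = label - num_classes;  // mark positives
  std::vector<long long> order(num_classes);
  std::iota(order.begin(), order.end(), 0LL);
  std::stable_sort(order.begin(), order.end(),
                   [&](long long a, long long b) { return keys[a] < keys[b]; });
  order.resize(static_cast<std::size_t>(num_samples));
  return order;  // sampled class center ids, positives first
}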
+ + // step 1: Calculate num classes per device using nccl all reduce + std::vector shard_dim_vec(nranks + 1, 0); + shard_dim_vec[rank + 1] = num_classes; + DenseTensor num_classes_per_device; + paddle::framework::TensorFromVector( + shard_dim_vec, dev_ctx, &num_classes_per_device); + T* num_classes_per_device_ptr = num_classes_per_device.data(); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(ring_id)) { + // Use ProcessGroup + paddle::distributed::ProcessGroup* pg = map->get(ring_id); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(num_classes_per_device); + out_tensor.push_back(num_classes_per_device); + + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = paddle::distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + const auto& comm = paddle::platform::NCCLCommContext::Instance().Get( + ring_id, dev_ctx.GetPlace()); + // use global calculate stream + const auto calcu_stream = + static_cast( + paddle::platform::DeviceContextPool::Instance().Get( + dev_ctx.GetPlace())) + ->stream(); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( + num_classes_per_device_ptr, + num_classes_per_device_ptr, + num_classes_per_device.numel(), + paddle::platform::ToNCCLDataType( + paddle::framework::TransToProtoVarType( + num_classes_per_device.dtype())), + ncclSum, + comm->comm(), + calcu_stream)); + } + } +#endif + + // step 2: Determine temporary device storage requirements + int num_buffer_ele = std::max(batch_size, num_classes); + size_t cub_sort_temp_store_size = 0; + PADDLE_ENFORCE_GPU_SUCCESS( + (cub::DeviceRadixSort::SortPairs(nullptr, + cub_sort_temp_store_size, + nullptr, + nullptr, + nullptr, + nullptr, + num_buffer_ele, + 0, + sizeof(T) * 8, + dev_ctx.stream()))); + + size_t cub_sum_temp_store_size = 0; + NotEqualToPreviousAdjacentIterator unique_counting_iter_temp(nullptr, 0); + PADDLE_ENFORCE_GPU_SUCCESS( + (cub::DeviceScan::InclusiveSum, T*>( + nullptr, + cub_sum_temp_store_size, + unique_counting_iter_temp, + nullptr, + batch_size, + dev_ctx.stream()))); + + size_t cub_scan_temp_store_size = 0; + ActualNumSampledFunctor actual_num_sampled_op_temp(num_samples); + PADDLE_ENFORCE_GPU_SUCCESS( + (cub::DeviceScan::InclusiveScan(nullptr, + cub_scan_temp_store_size, + num_classes_per_device_ptr, + num_classes_per_device_ptr, + actual_num_sampled_op_temp, + nranks + 1, + dev_ctx.stream()))); + + size_t cub_temp_storage_bytes = + std::max(std::max(cub_sort_temp_store_size, cub_scan_temp_store_size), + cub_sum_temp_store_size); + int num_temp_ele = cub_temp_storage_bytes / sizeof(T) + 1; + + // step 3: Alloc buffer memory so that we can reuse allocated memory + MemoryBuffer memory_buffer = + MemoryBuffer(num_buffer_ele, num_temp_ele, nranks, dev_ctx); + + T* cub_sort_keys_ptr = memory_buffer.cub_sort_keys_ptr(); + T* cub_sort_keys_out_ptr = memory_buffer.cub_sort_keys_out_ptr(); + T* cub_sort_values_ptr = memory_buffer.cub_sort_values_ptr(); + T* cub_sort_values_out_ptr = memory_buffer.cub_sort_values_out_ptr(); + T* bound_index_ptr = memory_buffer.bound_index_ptr(); + T* bound_value_ptr = memory_buffer.bound_value_ptr(); + T* class_interval_ptr = memory_buffer.class_interval_ptr(); + void* cub_temp_storage_ptr = memory_buffer.cub_temp_storage_ptr(); + + // step 4: Calculate class interval among nranks + PADDLE_ENFORCE_GPU_SUCCESS( + 
(cub::DeviceScan::InclusiveSum(cub_temp_storage_ptr, + cub_temp_storage_bytes, + num_classes_per_device_ptr, + class_interval_ptr, + nranks + 1, + dev_ctx.stream()))); + + // step 5: random sample negative class center + uint64_t seed_data; + uint64_t increment; + int vec_size = VectorizedSize(cub_sort_keys_ptr); + auto offset = ((num_classes - 1) / + (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) + + 1) * + vec_size; + // auto gen_cuda = paddle::framework::DefaultCUDAGenerator(device_id); + auto gen_cuda = dev_ctx.GetGenerator(); + if (!fix_seed) { + auto seed_offset = gen_cuda->IncrementOffset(offset); + seed_data = seed_offset.first; + increment = seed_offset.second; + } else { + seed_data = seed + rank; + increment = offset; + } + RandomSampleClassCenter + <<>>( + num_classes, seed_data, increment, num_classes, cub_sort_keys_ptr); + + // step 6: mark positive class center as negative value + // fill the sort values to index 0, 1, ..., batch_size-1 + MarkPositiveClassCenter + <<>>( + batch_size, + rank, + class_interval_ptr, + num_classes, + label.data(), + cub_sort_keys_ptr); + Range<<>>( + num_buffer_ele, cub_sort_values_ptr); + + // step 7: sort class center by ascending, so that positive class center + // always be sampled. + PADDLE_ENFORCE_GPU_SUCCESS( + (cub::DeviceRadixSort::SortPairs(cub_temp_storage_ptr, + cub_temp_storage_bytes, + cub_sort_keys_ptr, + cub_sort_keys_out_ptr, + cub_sort_values_ptr, + cub_sort_values_out_ptr, + num_classes, + 0, + sizeof(T) * 8, + dev_ctx.stream()))); + + // step 8: sort input label ascending + PADDLE_ENFORCE_GPU_SUCCESS( + (cub::DeviceRadixSort::SortPairs(cub_temp_storage_ptr, + cub_temp_storage_bytes, + label.data(), + cub_sort_keys_out_ptr, + cub_sort_values_ptr, + cub_sort_keys_ptr, + batch_size, + 0, + sizeof(T) * 8, + dev_ctx.stream()))); + + // step 9: Calculate new index using InclusiveSum on ascending sorted input + // label + NotEqualToPreviousAdjacentIterator unique_counting_iter( + cub_sort_keys_out_ptr, 0); + PADDLE_ENFORCE_GPU_SUCCESS( + (cub::DeviceScan::InclusiveSum, T*>( + cub_temp_storage_ptr, + cub_temp_storage_bytes, + unique_counting_iter, + cub_sort_values_ptr, + batch_size, + dev_ctx.stream()))); + + // step 10: Calculate new class center bound among ranks + GetClassCenterBound + <<>>( + batch_size, + nranks, + class_interval_ptr, + cub_sort_keys_out_ptr, + cub_sort_values_ptr, + bound_index_ptr, + bound_value_ptr); + + // step 11: Calculate actual number of sampled class per device. + // Since maybe num_positive_class_center > num_samples, + // we need to ensure all positive class center per device are sampled. 
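Step 11 below realizes the guarantee described in the comment above with an InclusiveScan over the class-center bounds. The per-device intent is easier to read in a plain host-side form; the sketch is illustrative and works on per-device positive counts rather than the bound_value prefix array the kernel actually scans:

#include <algorithm>
#include <cstddef>
#include <vector>

// Every device samples at least num_samples class centers, but never fewer
// than the positive class centers it owns, so no positive center is dropped.
std::vector<long long> ActualNumSampledPerDevice(
    const std::vector<long long>& num_positive_per_device,
    long long num_samples) {
  std::vector<long long> out(num_positive_per_device.size());
  for (std::size_t i = 0; i < out.size(); ++i) {
    out[i] = std::max(num_samples, num_positive_per_device[i]);
  }
  return out;
}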
+ ActualNumSampledFunctor actual_num_sampled_op(num_samples); + PADDLE_ENFORCE_GPU_SUCCESS( + (cub::DeviceScan::InclusiveScan(cub_temp_storage_ptr, + cub_temp_storage_bytes, + bound_value_ptr, + num_classes_per_device_ptr, + actual_num_sampled_op, + nranks + 1, + dev_ctx.stream()))); + + // step 12: Calculate actual sampled class interval among nranks + PADDLE_ENFORCE_GPU_SUCCESS( + (cub::DeviceScan::InclusiveSum(cub_temp_storage_ptr, + cub_temp_storage_bytes, + num_classes_per_device_ptr, + class_interval_ptr, + nranks + 1, + dev_ctx.stream()))); + + // step 13: Get remapped label for output + GetRemappedLabel + <<>>( + batch_size, + nranks, + class_interval_ptr, + bound_index_ptr, + bound_value_ptr, + cub_sort_keys_ptr, + cub_sort_values_ptr, + dev_ctx.template Alloc(remapped_label)); + + // step 14: Get sampled class center for output + phi::Copy(dev_ctx, + num_classes_per_device, + phi::CPUPlace(), + true, + &num_classes_per_device); + T actual_num_samples = num_classes_per_device.data()[rank + 1]; + sampled_local_class_center->Resize(phi::make_ddim({actual_num_samples})); + + T* sampled_local_class_center_ptr = + dev_ctx.template Alloc(sampled_local_class_center); + paddle::memory::Copy(dev_ctx.GetPlace(), + sampled_local_class_center_ptr, + dev_ctx.GetPlace(), + cub_sort_values_out_ptr, + actual_num_samples * sizeof(T), + nullptr); +} +} // namespace phi + +PD_REGISTER_KERNEL(class_center_sample, + GPU, + ALL_LAYOUT, + phi::ClassCenterSampleKernel, + int64_t, + int) {} diff --git a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu index 841d98fbc003e..4aa59cded8f37 100644 --- a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu @@ -24,29 +24,41 @@ template void DropoutGradRawKernel(const Context& dev_ctx, const DenseTensor& mask, const DenseTensor& out_grad, - float p, + const Scalar& p, bool is_test, const std::string& mode, DenseTensor* x_grad) { bool upscale_in_train = (mode == "upscale_in_train"); x_grad->mutable_data(dev_ctx.GetPlace()); - paddle::operators::DropoutGradGPUKernelDriver( - dev_ctx, is_test, p, upscale_in_train, out_grad, mask, x_grad, false); + paddle::operators::DropoutGradGPUKernelDriver(dev_ctx, + is_test, + p.to(), + upscale_in_train, + out_grad, + mask, + x_grad, + false); } template void DropoutNdGradKernel(const Context& dev_ctx, const DenseTensor& mask, const DenseTensor& out_grad, - float p, + const Scalar& p, bool is_test, const std::string& mode, const std::vector& axis, DenseTensor* x_grad) { bool upscale_in_train = (mode == "upscale_in_train"); dev_ctx.template Alloc(x_grad); - paddle::operators::DropoutGradGPUKernelDriver( - dev_ctx, is_test, p, upscale_in_train, out_grad, mask, x_grad, true); + paddle::operators::DropoutGradGPUKernelDriver(dev_ctx, + is_test, + p.to(), + upscale_in_train, + out_grad, + mask, + x_grad, + true); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/dropout_kernel.cu b/paddle/phi/kernels/gpu/dropout_kernel.cu index f973bb8e15fc7..0f2a8d9c93848 100644 --- a/paddle/phi/kernels/gpu/dropout_kernel.cu +++ b/paddle/phi/kernels/gpu/dropout_kernel.cu @@ -24,7 +24,7 @@ template void DropoutRawKernel(const Context& dev_ctx, const DenseTensor& x, const paddle::optional& seed_tensor, - float p, + const Scalar& p, bool is_test, const std::string& mode, int seed, @@ -36,7 +36,7 @@ void DropoutRawKernel(const Context& dev_ctx, mask->mutable_data(dev_ctx.GetPlace()); paddle::operators::DropoutFwGPUKernelDriver(dev_ctx, is_test, - p, + p.to(), 
upscale_in_train, fix_seed, seed, @@ -51,7 +51,7 @@ template void DropoutNdKernel(const Context& dev_ctx, const DenseTensor& x, const paddle::optional& seed_tensor, - float p, + const Scalar& p, bool is_test, const std::string& mode, int seed, @@ -64,7 +64,7 @@ void DropoutNdKernel(const Context& dev_ctx, dev_ctx.template Alloc(mask); paddle::operators::DropoutFwGPUKernelDriver(dev_ctx, is_test, - p, + p.to(), upscale_in_train, fix_seed, seed, diff --git a/paddle/phi/kernels/gpu/fft_grad_kernel.cu b/paddle/phi/kernels/gpu/fft_grad_kernel.cu new file mode 100644 index 0000000000000..69a95cffc3ee0 --- /dev/null +++ b/paddle/phi/kernels/gpu/fft_grad_kernel.cu @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fft_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c_grad, + GPU, + ALL_LAYOUT, + phi::FFTC2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL( + fft_c2r_grad, GPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) {} +PD_REGISTER_KERNEL(fft_r2c_grad, + GPU, + ALL_LAYOUT, + phi::FFTR2CGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/fft_kernel.cu b/paddle/phi/kernels/gpu/fft_kernel.cu new file mode 100644 index 0000000000000..aaa1ed0c225c6 --- /dev/null +++ b/paddle/phi/kernels/gpu/fft_kernel.cu @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
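The dropout changes above replace the plain float probability with a Scalar and read it through p.to<float>(), so the same kernel signature can take the probability either as a fixed attribute or as a value resolved at run time. A minimal stand-in showing that conversion pattern; this is a sketch, not phi::Scalar:

// Stand-in for the Scalar-style argument used by the dropout kernels above.
class ProbabilityScalarSketch {
 public:
  explicit ProbabilityScalarSketch(double v) : value_(v) {}
  template <typename T>
  T to() const { return static_cast<T>(value_); }

 private:
  double value_;
};

// Usage, mirroring the call sites above:
//   ProbabilityScalarSketch p(0.5);
//   float keep_prob = 1.0f - p.to<float>();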
+ +#include "paddle/phi/kernels/fft_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fft_kernel_impl.h" + +PD_REGISTER_KERNEL(fft_c2c, + GPU, + ALL_LAYOUT, + phi::FFTC2CKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_c2r, + GPU, + ALL_LAYOUT, + phi::FFTC2RKernel, + phi::dtype::complex, + phi::dtype::complex) {} +PD_REGISTER_KERNEL(fft_r2c, GPU, ALL_LAYOUT, phi::FFTR2CKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/fill_diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_grad_kernel.cu index 8884dfae17820..0a1069cf3a90a 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_grad_kernel.cu @@ -18,6 +18,7 @@ #include #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { @@ -60,7 +61,7 @@ void FillDiagonalGradKernel(const Context& ctx, auto size = x_grad->numel(); auto out_dims = x_grad->dims(); - auto strides = CalStride(out_dims); + auto strides = funcs::CalStride(out_dims); auto wrapsize = std::min(size, out_dims[1] * out_dims[1]); // The wrap mode supported only the dims equels to 2; In wrap mode, the diff --git a/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu index 3116842002abd..1d342abc31745 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { @@ -63,7 +64,7 @@ void FillDiagonalKernel(const Context& ctx, auto size = out->numel(); auto out_dims = out->dims(); - auto strides = CalStride(out_dims); + auto strides = funcs::CalStride(out_dims); // The wrap mode supported only the dims equels to 2; In wrap mode, the // value will be filled in cycles diff --git a/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu new file mode 100644 index 0000000000000..0e302b23ee98c --- /dev/null +++ b/paddle/phi/kernels/gpu/fill_diagonal_tensor_grad_kernel.cu @@ -0,0 +1,114 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
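The funcs::CalStride call that now replaces the local helper above boils down to the usual row-major stride computation, under which consecutive main-diagonal entries of a rows x cols matrix sit (cols + 1) elements apart. A self-contained sketch of that computation; the real helper lives in funcs/common_shape.h and may differ in detail:

#include <cstddef>
#include <vector>

// Row-major strides: stride[i] is the product of all dimensions after i, so
// flat_index = sum_i coord[i] * stride[i]; for a 2-D matrix the diagonal then
// advances by stride[0] + stride[1] = cols + 1 per step.
std::vector<long long> CalStrideSketch(const std::vector<long long>& dims) {
  std::vector<long long> strides(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  return strides;
}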
+ +#include "paddle/phi/kernels/fill_diagonal_tensor_grad_kernel.h" + +#include +#include + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void fill_grad_kernel(int64_t size, + T *out_data, + int64_t *strides, + int64_t *matdim, + int64_t offset, + int64_t fill_dims0, + int64_t fill_dims1) { + int64_t i = blockIdx.x; + auto sumoff = matdim[i] + offset; + for (int64_t j = threadIdx.x; j < fill_dims1; j += blockDim.x) { + auto fill_index = j * (strides[1] + strides[0]) + sumoff; + if (fill_index < size) { + out_data[fill_index] = T(0); + } + } +} + +template +void FillDiagonalTensorGradKernel(const Context &ctx, + const DenseTensor &out_grad, + int64_t offset, + int dim1, + int dim2, + DenseTensor *x_grad) { +#ifdef __HIPCC__ + const int64_t kMaxBlockDim = 256; +#else + const int64_t kMaxBlockDim = 512; +#endif + auto matrows = 1; + + if (x_grad) { + auto *data = ctx.template Alloc(x_grad); + auto dx_dims = x_grad->dims(); + phi::Copy(ctx, out_grad, ctx.GetPlace(), false, x_grad); + + for (int i = 0; i < dx_dims.size(); i++) { + if (i != dim1 && i != dim2) { + matrows *= dx_dims[i]; + } + } + + int64_t new_dims[2]; + std::vector memory_block; + memory_block.resize(2 + matrows); + int64_t *strides = &memory_block[0]; + int64_t *matdim = &memory_block[2]; + CalMatDims(dx_dims, dim1, dim2, &offset, new_dims, strides, matdim); + + auto size = x_grad->numel(); + + auto stream = ctx.stream(); + DenseTensor tensor_tmp; + tensor_tmp.Resize(phi::make_ddim({2 + matrows})); + int64_t *memory_block_cu = ctx.template Alloc(&tensor_tmp); + const auto gpu_place = ctx.GetPlace(); + paddle::memory::Copy(gpu_place, + memory_block_cu, + CPUPlace(), + memory_block.data(), + sizeof(int64_t) * (2 + matrows), + stream); + + int64_t *strides_cu = &memory_block_cu[0], *matdim_cu = &memory_block_cu[2]; + + auto kGridDim = new_dims[0]; + auto kBlockDim = std::min(int64_t(new_dims[1]), kMaxBlockDim); + fill_grad_kernel<<>>( + size, data, strides_cu, matdim_cu, offset, new_dims[0], new_dims[1]); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(fill_diagonal_tensor_grad, + GPU, + ALL_LAYOUT, + phi::FillDiagonalTensorGradKernel, + float, + double, + int64_t, + int, + int8_t, + uint8_t, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex, + bool) {} diff --git a/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu new file mode 100644 index 0000000000000..739a8666e3143 --- /dev/null +++ b/paddle/phi/kernels/gpu/fill_diagonal_tensor_kernel.cu @@ -0,0 +1,136 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
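The grad kernel above first copies out_grad into x_grad and then zeroes every position the forward pass overwrote with values from Y, since those positions carry no gradient back to X. A host-side sketch of that semantics for a plain rows x cols matrix with offset 0 (illustrative only):

#include <algorithm>
#include <cstddef>
#include <vector>

// x_grad equals out_grad except on the diagonal written by Y in the forward
// pass; fill_grad_kernel writes T(0) at exactly those flat indices.
std::vector<float> FillDiagonalTensorGradSketch(
    const std::vector<float>& out_grad, int rows, int cols) {
  std::vector<float> x_grad = out_grad;  // phi::Copy in the kernel
  const int step = cols + 1;             // strides[0] + strides[1]
  const int n = std::min(rows, cols);
  for (int k = 0; k < n; ++k) {
    x_grad[static_cast<std::size_t>(k) * static_cast<std::size_t>(step)] = 0.0f;
  }
  return x_grad;
}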
+ +#include "paddle/phi/kernels/fill_diagonal_tensor_kernel.h" + +#include +#include + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" + +namespace phi { + +template +__global__ void fill_diagonal_tensor_kernel(int64_t size, + T *out_data, + const T *fill_data, + int64_t *strides, + int64_t *matdim, + int64_t offset, + int64_t fill_dims0, + int64_t fill_dims1) { + int64_t i = blockIdx.x; + auto sumoff = matdim[i] + offset; + for (int64_t j = threadIdx.x; j < fill_dims1; j += blockDim.x) { + auto fill_index = j * (strides[1] + strides[0]) + sumoff; + if (fill_index < size) { + out_data[fill_index] = fill_data[i * fill_dims1 + j]; + } + } +} + +template +void FillDiagonalTensorKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &y, + int64_t offset, + int dim1, + int dim2, + DenseTensor *out) { +#ifdef __HIPCC__ + const int64_t kMaxBlockDim = 256; +#else + const int64_t kMaxBlockDim = 512; +#endif + phi::Copy(ctx, x, ctx.GetPlace(), false, out); + + T *out_data = ctx.template Alloc(out); + const T *fill_data = y.data(); + + auto out_dims = out->dims(); + auto matdims = y.dims(); + auto fill_dims = phi::flatten_to_2d(matdims, matdims.size() - 1); + + int64_t new_dims[2]; + std::vector memory_block; + memory_block.resize(2 + fill_dims[0]); + int64_t *strides = &(memory_block[0]); + int64_t *matdim = &(memory_block[2]); + CalMatDims(out_dims, dim1, dim2, &offset, new_dims, strides, matdim); + PADDLE_ENFORCE_EQ( + new_dims[0], + fill_dims[0], + errors::InvalidArgument("The dims should be %d x %d, but get " + "%d x %d in fill tensor Y", + new_dims[0], + new_dims[1], + fill_dims[0], + fill_dims[1])); + PADDLE_ENFORCE_EQ( + new_dims[1], + fill_dims[1], + errors::InvalidArgument("The dims should be %d x %d, but get " + "%d x %d in fill tensor Y", + new_dims[0], + new_dims[1], + fill_dims[0], + fill_dims[1])); + + auto size = out->numel(); + + auto stream = ctx.stream(); + DenseTensor tensor_tmp; + tensor_tmp.Resize(phi::make_ddim({2 + fill_dims[0]})); + int64_t *memory_block_cu = ctx.template Alloc(&tensor_tmp); + const auto gpu_place = ctx.GetPlace(); + paddle::memory::Copy(gpu_place, + memory_block_cu, + CPUPlace(), + memory_block.data(), + sizeof(int64_t) * (2 + fill_dims[0]), + stream); + + int64_t *strides_cu = &memory_block_cu[0], *matdim_cu = &memory_block_cu[2]; + + auto kGridDim = new_dims[0]; + auto kBlockDim = std::min(int64_t(new_dims[1]), kMaxBlockDim); + fill_diagonal_tensor_kernel + <<>>(size, + out_data, + fill_data, + strides_cu, + matdim_cu, + offset, + fill_dims[0], + fill_dims[1]); +} + +} // namespace phi + +PD_REGISTER_KERNEL(fill_diagonal_tensor, + GPU, + ALL_LAYOUT, + phi::FillDiagonalTensorKernel, + float, + double, + int64_t, + int, + int8_t, + uint8_t, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex, + bool) {} diff --git a/paddle/phi/kernels/gpu/fill_grad_kernel.cu b/paddle/phi/kernels/gpu/fill_grad_kernel.cu new file mode 100644 index 0000000000000..32559ba95dfbc --- /dev/null +++ b/paddle/phi/kernels/gpu/fill_grad_kernel.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fill_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fill_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(fill_grad, + GPU, + ALL_LAYOUT, + phi::FillGradKernel, + float, + double, + int64_t, + int, + paddle::platform::float16, + bool) {} diff --git a/paddle/phi/kernels/gpu/fill_kernel.cu b/paddle/phi/kernels/gpu/fill_kernel.cu new file mode 100644 index 0000000000000..141e47b8cb109 --- /dev/null +++ b/paddle/phi/kernels/gpu/fill_kernel.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fill_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fill_kernel_impl.h" + +PD_REGISTER_KERNEL(fill, + GPU, + ALL_LAYOUT, + phi::FillKernel, + float, + double, + int64_t, + int, + paddle::platform::float16, + bool) {} diff --git a/paddle/phi/kernels/gpu/fold_grad_kernel.cu b/paddle/phi/kernels/gpu/fold_grad_kernel.cu new file mode 100644 index 0000000000000..ad469dd7981de --- /dev/null +++ b/paddle/phi/kernels/gpu/fold_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fold_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fold_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + fold_grad, GPU, ALL_LAYOUT, phi::FoldGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/fold_kernel.cu b/paddle/phi/kernels/gpu/fold_kernel.cu new file mode 100644 index 0000000000000..b53ef402150c2 --- /dev/null +++ b/paddle/phi/kernels/gpu/fold_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/fold_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/fold_kernel_impl.h" + +PD_REGISTER_KERNEL(fold, GPU, ALL_LAYOUT, phi::FoldKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu new file mode 100644 index 0000000000000..bcda357fd8f94 --- /dev/null +++ b/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu @@ -0,0 +1,589 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/generate_proposals_v2_kernel.h" + +#include +#include +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +int const kThreadsPerBlock = sizeof(uint64_t) * 8; + +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +struct RangeInitFunctor { + int start_; + int delta_; + int *out_; + __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } +}; + +template +static void SortDescending(const phi::GPUContext &ctx, + const DenseTensor &value, + DenseTensor *value_out, + DenseTensor *index_out) { + int num = static_cast(value.numel()); + DenseTensor index_in_t; + index_in_t.Resize(phi::make_ddim({num})); + int *idx_in = ctx.template Alloc(&index_in_t); + phi::funcs::ForRange for_range(ctx, num); + for_range(RangeInitFunctor{0, 1, idx_in}); + + index_out->Resize(phi::make_ddim({num})); + int *idx_out = ctx.template Alloc(index_out); + + const T *keys_in = value.data(); + value_out->Resize(phi::make_ddim({num})); + T *keys_out = ctx.template Alloc(value_out); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending(nullptr, + temp_storage_bytes, + keys_in, + keys_out, + idx_in, + idx_out, + num, + 0, + sizeof(T) * 8, + ctx.stream()); + // Allocate temporary storage + auto place = ctx.GetPlace(); + auto d_temp_storage = paddle::memory::Alloc(place, temp_storage_bytes); + + // 
Run sorting operation + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage->ptr(), + temp_storage_bytes, + keys_in, + keys_out, + idx_in, + idx_out, + num, + 0, + sizeof(T) * 8, + ctx.stream()); +} + +template +struct BoxDecodeAndClipFunctor { + const T *anchor; + const T *deltas; + const T *var; + const int *index; + const T *im_info; + const bool pixel_offset; + + T *proposals; + + BoxDecodeAndClipFunctor(const T *anchor, + const T *deltas, + const T *var, + const int *index, + const T *im_info, + T *proposals, + bool pixel_offset = true) + : anchor(anchor), + deltas(deltas), + var(var), + index(index), + im_info(im_info), + proposals(proposals), + pixel_offset(pixel_offset) {} + + T bbox_clip_default{static_cast(kBBoxClipDefault)}; + + __device__ void operator()(size_t i) { + int k = index[i] * 4; + T axmin = anchor[k]; + T aymin = anchor[k + 1]; + T axmax = anchor[k + 2]; + T aymax = anchor[k + 3]; + + T offset = pixel_offset ? static_cast(1.0) : 0; + T w = axmax - axmin + offset; + T h = aymax - aymin + offset; + T cx = axmin + 0.5 * w; + T cy = aymin + 0.5 * h; + + T dxmin = deltas[k]; + T dymin = deltas[k + 1]; + T dxmax = deltas[k + 2]; + T dymax = deltas[k + 3]; + + T d_cx, d_cy, d_w, d_h; + if (var) { + d_cx = cx + dxmin * w * var[k]; + d_cy = cy + dymin * h * var[k + 1]; + d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w; + d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h; + } else { + d_cx = cx + dxmin * w; + d_cy = cy + dymin * h; + d_w = exp(Min(dxmax, bbox_clip_default)) * w; + d_h = exp(Min(dymax, bbox_clip_default)) * h; + } + + T oxmin = d_cx - d_w * 0.5; + T oymin = d_cy - d_h * 0.5; + T oxmax = d_cx + d_w * 0.5 - offset; + T oymax = d_cy + d_h * 0.5 - offset; + + proposals[i * 4] = Max(Min(oxmin, im_info[1] - offset), 0.); + proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - offset), 0.); + proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - offset), 0.); + proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - offset), 0.); + } + + __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; } + + __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; } +}; + +template +static __global__ void FilterBBoxes(const T *bboxes, + const T *im_info, + const T min_size, + const int num, + int *keep_num, + int *keep, + bool is_scale = true, + bool pixel_offset = true) { + T im_h = im_info[0]; + T im_w = im_info[1]; + + int cnt = 0; + __shared__ int keep_index[BlockSize]; + + CUDA_KERNEL_LOOP(i, num) { + keep_index[threadIdx.x] = -1; + __syncthreads(); + + int k = i * 4; + T xmin = bboxes[k]; + T ymin = bboxes[k + 1]; + T xmax = bboxes[k + 2]; + T ymax = bboxes[k + 3]; + T offset = pixel_offset ? static_cast(1.0) : 0; + T w = xmax - xmin + offset; + T h = ymax - ymin + offset; + if (pixel_offset) { + T cx = xmin + w / 2.; + T cy = ymin + h / 2.; + + if (is_scale) { + w = (xmax - xmin) / im_info[2] + 1.; + h = (ymax - ymin) / im_info[2] + 1.; + } + + if (w >= min_size && h >= min_size && cx <= im_w && cy <= im_h) { + keep_index[threadIdx.x] = i; + } + } else { + if (w >= min_size && h >= min_size) { + keep_index[threadIdx.x] = i; + } + } + __syncthreads(); + if (threadIdx.x == 0) { + int size = (num - i) < BlockSize ? 
num - i : BlockSize; + for (int j = 0; j < size; ++j) { + if (keep_index[j] > -1) { + keep[cnt++] = keep_index[j]; + } + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + keep_num[0] = cnt; + } +} + +static __device__ float IoU(const float *a, + const float *b, + const bool pixel_offset = true) { + float offset = pixel_offset ? static_cast(1.0) : 0; + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + offset, 0.f), + height = max(bottom - top + offset, 0.f); + float inter_s = width * height; + float s_a = (a[2] - a[0] + offset) * (a[3] - a[1] + offset); + float s_b = (b[2] - b[0] + offset) * (b[3] - b[1] + offset); + return inter_s / (s_a + s_b - inter_s); +} + +static __global__ void NMSKernel(const int n_boxes, + const float nms_overlap_thresh, + const float *dev_boxes, + uint64_t *dev_mask, + bool pixel_offset = true) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + const int row_size = + min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock); + const int col_size = + min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock); + + __shared__ float block_boxes[kThreadsPerBlock * 4]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 4 + 0] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0]; + block_boxes[threadIdx.x * 4 + 1] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1]; + block_boxes[threadIdx.x * 4 + 2] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2]; + block_boxes[threadIdx.x * 4 + 3] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + uint64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (IoU(cur_box, block_boxes + i * 4, pixel_offset) > + nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +template +static void NMS(const phi::GPUContext &ctx, + const DenseTensor &proposals, + const DenseTensor &sorted_indices, + const T nms_threshold, + DenseTensor *keep_out, + bool pixel_offset = true) { + int boxes_num = proposals.dims()[0]; + const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); + dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock), + DIVUP(boxes_num, kThreadsPerBlock)); + dim3 threads(kThreadsPerBlock); + + const T *boxes = proposals.data(); + auto place = ctx.GetPlace(); + auto mask_ptr = + paddle::memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); + uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); + + NMSKernel<<>>( + boxes_num, nms_threshold, boxes, mask_dev, pixel_offset); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + + std::vector mask_host(boxes_num * col_blocks); + paddle::memory::Copy(CPUPlace(), + mask_host.data(), + place, + mask_dev, + boxes_num * col_blocks * sizeof(uint64_t), + ctx.stream()); + + std::vector keep_vec; + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / kThreadsPerBlock; + int inblock = i % kThreadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + ++num_to_keep; + keep_vec.push_back(i); + uint64_t *p = mask_host.data() + i * 
col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + keep_out->Resize(phi::make_ddim({num_to_keep})); + int *keep = ctx.template Alloc(keep_out); + paddle::memory::Copy(place, + keep, + CPUPlace(), + keep_vec.data(), + sizeof(int) * num_to_keep, + ctx.stream()); + ctx.Wait(); +} + +template +static std::pair ProposalForOneImage( + const phi::GPUContext &ctx, + const DenseTensor &im_shape, + const DenseTensor &anchors, + const DenseTensor &variances, + const DenseTensor &bbox_deltas, // [M, 4] + const DenseTensor &scores, // [N, 1] + int pre_nms_top_n, + int post_nms_top_n, + float nms_thresh, + float min_size, + float eta, + bool pixel_offset) { + // 1. pre nms + DenseTensor scores_sort, index_sort; + SortDescending(ctx, scores, &scores_sort, &index_sort); + int num = scores.numel(); + int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() + : pre_nms_top_n; + scores_sort.Resize(phi::make_ddim({pre_nms_num, 1})); + index_sort.Resize(phi::make_ddim({pre_nms_num, 1})); + + // 2. box decode and clipping + DenseTensor proposals; + proposals.Resize(phi::make_ddim({pre_nms_num, 4})); + ctx.template Alloc(&proposals); + + { + phi::funcs::ForRange for_range(ctx, pre_nms_num); + for_range(BoxDecodeAndClipFunctor{anchors.data(), + bbox_deltas.data(), + variances.data(), + index_sort.data(), + im_shape.data(), + proposals.data(), + pixel_offset}); + } + + // 3. filter + DenseTensor keep_index, keep_num_t; + keep_index.Resize(phi::make_ddim({pre_nms_num})); + ctx.template Alloc(&keep_index); + keep_num_t.Resize(phi::make_ddim({1})); + ctx.template Alloc(&keep_num_t); + min_size = std::max(min_size, 1.0f); + auto stream = ctx.stream(); + FilterBBoxes<<<1, 512, 0, stream>>>(proposals.data(), + im_shape.data(), + min_size, + pre_nms_num, + keep_num_t.data(), + keep_index.data(), + false, + pixel_offset); + int keep_num; + const auto gpu_place = ctx.GetPlace(); + paddle::memory::Copy(CPUPlace(), + &keep_num, + gpu_place, + keep_num_t.data(), + sizeof(int), + ctx.stream()); + ctx.Wait(); + keep_index.Resize(phi::make_ddim({keep_num})); + + DenseTensor scores_filter, proposals_filter; + // Handle the case when there is no keep index left + if (keep_num == 0) { + phi::funcs::SetConstant set_zero; + proposals_filter.Resize(phi::make_ddim({1, 4})); + ctx.template Alloc(&proposals_filter); + scores_filter.Resize(phi::make_ddim({1, 1})); + ctx.template Alloc(&scores_filter); + set_zero(ctx, &proposals_filter, static_cast(0)); + set_zero(ctx, &scores_filter, static_cast(0)); + return std::make_pair(proposals_filter, scores_filter); + } + proposals_filter.Resize(phi::make_ddim({keep_num, 4})); + ctx.template Alloc(&proposals_filter); + scores_filter.Resize(phi::make_ddim({keep_num, 1})); + ctx.template Alloc(&scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); + + if (nms_thresh <= 0) { + return std::make_pair(proposals_filter, scores_filter); + } + + // 4. 
nms + DenseTensor keep_nms; + NMS( + ctx, proposals_filter, keep_index, nms_thresh, &keep_nms, pixel_offset); + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize(phi::make_ddim({post_nms_top_n})); + } + + DenseTensor scores_nms, proposals_nms; + proposals_nms.Resize(phi::make_ddim({keep_nms.numel(), 4})); + ctx.template Alloc(&proposals_nms); + scores_nms.Resize(phi::make_ddim({keep_nms.numel(), 1})); + ctx.template Alloc(&scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + + return std::make_pair(proposals_nms, scores_nms); +} + +template +void GenerateProposalsV2Kernel(const Context &ctx, + const DenseTensor &scores, + const DenseTensor &bbox_deltas, + const DenseTensor &im_shape, + const DenseTensor &anchors, + const DenseTensor &variances, + int pre_nms_top_n, + int post_nms_top_n, + float nms_thresh, + float min_size, + float eta, + bool pixel_offset, + DenseTensor *rpn_rois, + DenseTensor *rpn_roi_probs, + DenseTensor *rpn_rois_num) { + PADDLE_ENFORCE_GE( + eta, + 1., + errors::InvalidArgument("Not support adaptive NMS. The attribute 'eta' " + "should not less than 1. But received eta=[%d]", + eta)); + + auto scores_dim = scores.dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas.dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + DenseTensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.Resize(phi::make_ddim({num, h_bbox, w_bbox, c_bbox})); + ctx.template Alloc(&bbox_deltas_swap); + scores_swap.Resize(phi::make_ddim({num, h_score, w_score, c_score})); + ctx.template Alloc(&scores_swap); + + phi::funcs::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(ctx, bbox_deltas, &bbox_deltas_swap, axis); + trans(ctx, scores, &scores_swap, axis); + + DenseTensor tmp_anchors = anchors; + DenseTensor tmp_variances = variances; + tmp_anchors.Resize(phi::make_ddim({tmp_anchors.numel() / 4, 4})); + tmp_variances.Resize(phi::make_ddim({tmp_variances.numel() / 4, 4})); + + rpn_rois->Resize(phi::make_ddim({bbox_deltas.numel() / 4, 4})); + ctx.template Alloc(rpn_rois); + rpn_roi_probs->Resize(phi::make_ddim({scores.numel(), 1})); + ctx.template Alloc(rpn_roi_probs); + + T *rpn_rois_data = rpn_rois->data(); + T *rpn_roi_probs_data = rpn_roi_probs->data(); + + auto place = ctx.GetPlace(); + auto cpu_place = phi::CPUPlace(); + + int64_t num_proposals = 0; + std::vector offset(1, 0); + std::vector tmp_num; + + for (int64_t i = 0; i < num; ++i) { + DenseTensor im_shape_slice = im_shape.Slice(i, i + 1); + DenseTensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + DenseTensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize(phi::make_ddim({h_bbox * w_bbox * c_bbox / 4, 4})); + scores_slice.Resize(phi::make_ddim({h_score * w_score * c_score, 1})); + + std::pair box_score_pair = + ProposalForOneImage(ctx, + im_shape_slice, + tmp_anchors, + tmp_variances, + bbox_deltas_slice, + scores_slice, + pre_nms_top_n, + post_nms_top_n, + nms_thresh, + min_size, + eta, + pixel_offset); + + DenseTensor &proposals = box_score_pair.first; + DenseTensor &nscores = box_score_pair.second; + + paddle::memory::Copy(place, + rpn_rois_data + num_proposals * 4, + place, + proposals.data(), + sizeof(T) * proposals.numel(), + ctx.stream()); + paddle::memory::Copy(place, + 
rpn_roi_probs_data + num_proposals, + place, + nscores.data(), + sizeof(T) * nscores.numel(), + ctx.stream()); + ctx.Wait(); + num_proposals += proposals.dims()[0]; + offset.emplace_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); + } + if (rpn_rois_num != nullptr) { + rpn_rois_num->Resize(phi::make_ddim({num})); + ctx.template Alloc(rpn_rois_num); + int *num_data = rpn_rois_num->data(); + paddle::memory::Copy(place, + num_data, + cpu_place, + &tmp_num[0], + sizeof(int) * num, + ctx.stream()); + rpn_rois_num->Resize(phi::make_ddim({num})); + } + phi::LoD lod; + lod.emplace_back(offset); + rpn_rois->Resize(phi::make_ddim({num_proposals, 4})); + rpn_roi_probs->Resize(phi::make_ddim({num_proposals, 1})); +} + +} // namespace phi + +PD_REGISTER_KERNEL(generate_proposals_v2, + GPU, + ALL_LAYOUT, + phi::GenerateProposalsV2Kernel, + float) {} diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu index c1e9184b222e9..8457936eb65bc 100644 --- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu @@ -37,9 +37,13 @@ namespace phi { template struct DegreeFunctor { const T* col_ptr; - HOSTDEVICE explicit inline DegreeFunctor(const T* x) { this->col_ptr = x; } + int64_t len_col_ptr; + HOSTDEVICE explicit inline DegreeFunctor(const T* x, int64_t len_col_ptr) { + this->col_ptr = x; + this->len_col_ptr = len_col_ptr; + } HOSTDEVICE inline int operator()(T i) const { - return col_ptr[i + 1] - col_ptr[i]; + return i > len_col_ptr - 1 ? 0 : col_ptr[i + 1] - col_ptr[i]; } }; @@ -58,6 +62,7 @@ template __global__ void SampleKernel(const uint64_t rand_seed, int k, const int64_t num_nodes, + const int64_t len_col_ptr, const T* nodes, const T* row, const T* col_ptr, @@ -88,6 +93,10 @@ __global__ void SampleKernel(const uint64_t rand_seed, while (out_row < last_row) { T node = nodes[out_row]; + if (node > len_col_ptr - 1) { + out_row += BLOCK_WARPS; + continue; + } T in_row_start = col_ptr[node]; int deg = col_ptr[node + 1] - in_row_start; int out_row_start = output_ptr[out_row]; @@ -139,10 +148,12 @@ __global__ void SampleKernel(const uint64_t rand_seed, template int GetTotalSampleNum(const thrust::device_ptr input, const T* col_ptr, + int64_t len_col_ptr, thrust::device_ptr output_count, int sample_size, int bs) { - thrust::transform(input, input + bs, output_count, DegreeFunctor(col_ptr)); + thrust::transform( + input, input + bs, output_count, DegreeFunctor(col_ptr, len_col_ptr)); if (sample_size >= 0) { thrust::transform( output_count, output_count + bs, output_count, MaxFunctor(sample_size)); @@ -163,6 +174,7 @@ void SampleNeighbors(const Context& dev_ctx, int sample_size, int bs, int total_sample_num, + int64_t len_col_ptr, bool return_eids) { thrust::device_vector output_ptr; output_ptr.resize(bs); @@ -179,6 +191,7 @@ void SampleNeighbors(const Context& dev_ctx, 0, sample_size, bs, + len_col_ptr, thrust::raw_pointer_cast(input), row, col_ptr, @@ -193,6 +206,7 @@ template __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, int k, const int64_t num_rows, + const int64_t len_col_ptr, const T* in_rows, T* src, const T* dst_count) { @@ -214,6 +228,10 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, while (out_row < last_row) { const T row = in_rows[out_row]; + if (row > len_col_ptr - 1) { + out_row += BLOCK_WARPS; + continue; + } const T in_row_start = dst_count[row]; const int deg = dst_count[row + 1] - in_row_start; int split; 
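The graph_sample_neighbors changes above guard every degree lookup with the new len_col_ptr bound, so node ids outside the CSR column-pointer range sample nothing. A minimal host-side sketch of that rule (illustration only; csr_degree is a made-up name and is not part of the patch):

#include <cstdint>

// Degree of `node` in a CSR column-pointer array of length `len_col_ptr`.
// Out-of-range nodes are treated as isolated (degree 0), mirroring
// `i > len_col_ptr - 1 ? 0 : col_ptr[i + 1] - col_ptr[i]` in DegreeFunctor.
template <typename T>
inline int csr_degree(const T* col_ptr, int64_t len_col_ptr, T node) {
  if (node > len_col_ptr - 1) {
    return 0;
  }
  return static_cast<int>(col_ptr[node + 1] - col_ptr[node]);
}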
@@ -312,6 +330,7 @@ void FisherYatesSampleNeighbors(const Context& dev_ctx, int sample_size, int bs, int total_sample_num, + int64_t len_col_ptr, bool return_eids) { thrust::device_vector output_ptr; output_ptr.resize(bs); @@ -328,6 +347,7 @@ void FisherYatesSampleNeighbors(const Context& dev_ctx, <<>>(0, sample_size, bs, + len_col_ptr, thrust::raw_pointer_cast(input), perm_data, col_ptr); @@ -365,6 +385,7 @@ void GraphSampleNeighborsKernel( auto* col_ptr_data = col_ptr.data(); auto* x_data = x.data(); int bs = x.dims()[0]; + int64_t len_col_ptr = col_ptr.dims()[0]; const thrust::device_ptr input(x_data); @@ -373,7 +394,7 @@ void GraphSampleNeighborsKernel( thrust::device_ptr output_count(out_count_data); int total_sample_size = GetTotalSampleNum( - input, col_ptr_data, output_count, sample_size, bs); + input, col_ptr_data, len_col_ptr, output_count, sample_size, bs); out->Resize({static_cast(total_sample_size)}); T* out_data = dev_ctx.template Alloc(out); @@ -396,6 +417,7 @@ void GraphSampleNeighborsKernel( sample_size, bs, total_sample_size, + len_col_ptr, return_eids); } else { DenseTensor perm_buffer_out(perm_buffer->type()); @@ -414,6 +436,7 @@ void GraphSampleNeighborsKernel( sample_size, bs, total_sample_size, + len_col_ptr, return_eids); } } else { @@ -431,6 +454,7 @@ void GraphSampleNeighborsKernel( sample_size, bs, total_sample_size, + len_col_ptr, return_eids); } else { DenseTensor perm_buffer_out(perm_buffer->type()); @@ -449,6 +473,7 @@ void GraphSampleNeighborsKernel( sample_size, bs, total_sample_size, + len_col_ptr, return_eids); } } diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h index a93603ae18f1c..4be92ae18629c 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -81,7 +81,7 @@ __global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, size_t slice_size) { CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min()) { + if (*(output + i) == std::numeric_limits::lowest()) { *(output + i) = 0; } } diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu index 7ecf352ffe996..4dc2794d9c949 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu @@ -37,20 +37,27 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, DenseTensor* out, DenseTensor* dst_count = nullptr) { const int& index_size = src_index.dims()[0]; - ctx.template Alloc(out); - T* p_output = out->data(); const auto& src_dims = x.dims(); int64_t memset_size = 1; if (out_size <= 0) { + out->Resize(src_dims); for (int i = 0; i < src_dims.size(); ++i) { memset_size *= src_dims[i]; } } else { + // Set out dim following out_size. 
+ std::vector dims_ = phi::vectorize(out->dims()); + if (dims_.size() > 0) { + dims_[0] = out_size; + } + out->Resize(phi::make_ddim(dims_)); memset_size = out_size; for (int i = 1; i < src_dims.size(); ++i) { memset_size *= src_dims[i]; } } + ctx.template Alloc(out); + T* p_output = out->data(); const size_t& memset_bytes = memset_size * sizeof(T); if (pool_type == "SUM" || pool_type == "MEAN") { #ifdef PADDLE_WITH_HIP @@ -63,7 +70,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); + std::numeric_limits::lowest()); } else if (pool_type == "MIN") { thrust::device_ptr p_output_ptr(p_output); thrust::fill(thrust::device, @@ -91,7 +98,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; + int64_t input_size = out_size <= 0 ? src_dims[0] : out_size; if (pool_type == "SUM") { GraphSendRecvSumCUDAFunctor functor; GraphSendRecvCUDAKernel> @@ -103,9 +110,6 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, <<>>( p_src, s_index, d_index, p_output, index_size, slice_size, functor); - if (out_size > 0) { - input_size = out_size; - } int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; int64_t grid_max = grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx; @@ -117,9 +121,6 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, <<>>( p_src, s_index, d_index, p_output, index_size, slice_size, functor); - if (out_size > 0) { - input_size = out_size; - } int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; int64_t grid_min = grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; @@ -130,12 +131,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, GraphSendRecvCUDAKernel> <<>>( p_src, s_index, d_index, p_output, index_size, slice_size, functor); - + dst_count->Resize({input_size}); ctx.template Alloc(dst_count); - int32_t* p_dst_count = dst_count->data(); - if (out_size > 0) { - input_size = out_size; - } + int* p_dst_count = dst_count->data(); #ifdef PADDLE_WITH_HIP hipMemset(p_dst_count, 0, input_size * sizeof(int)); @@ -161,16 +159,29 @@ void GraphSendRecvKernel(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& pool_type, - int64_t out_size, + const IntArray& out_size, DenseTensor* out, DenseTensor* dst_count) { auto index_type = src_index.dtype(); + auto& out_size_data = out_size.GetData(); if (index_type == phi::DataType::INT32) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count); + GraphSendRecvOpCUDAKernelLaunchHelper(ctx, + x, + src_index, + dst_index, + pool_type, + out_size_data[0], + out, + dst_count); } else if (index_type == phi::DataType::INT64) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count); + GraphSendRecvOpCUDAKernelLaunchHelper(ctx, + x, + src_index, + dst_index, + pool_type, + out_size_data[0], + out, + dst_count); } } diff --git a/paddle/phi/kernels/gpu/lamb_kernel.cu b/paddle/phi/kernels/gpu/lamb_kernel.cu new file mode 100644 index 0000000000000..0b11b94fdb725 --- /dev/null +++ b/paddle/phi/kernels/gpu/lamb_kernel.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lamb_kernel.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lamb_kernel_impl.h" + +PD_REGISTER_KERNEL(lamb, + GPU, + ALL_LAYOUT, + phi::LambKernel, + phi::dtype::float16, + float, + double) { + kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/margin_cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/margin_cross_entropy_grad_kernel.cu new file mode 100644 index 0000000000000..54813422ef5e8 --- /dev/null +++ b/paddle/phi/kernels/gpu/margin_cross_entropy_grad_kernel.cu @@ -0,0 +1,243 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
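For the graph_send_recv hunk further up, the new out_size handling only overrides the leading dimension of the output shape. A minimal sketch of that shape rule (illustration only; SendRecvOutDims is a hypothetical helper, not part of the patch):

#include <cstdint>
#include <vector>

// When out_size <= 0 the output keeps the source dims; otherwise only
// dims[0] is replaced, mirroring out->Resize(phi::make_ddim(dims_)) above.
std::vector<int64_t> SendRecvOutDims(const std::vector<int64_t>& src_dims,
                                     int64_t out_size) {
  std::vector<int64_t> dims = src_dims;
  if (out_size > 0 && !dims.empty()) {
    dims[0] = out_size;
  }
  return dims;
}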
+ +// old op include, fluid should be removed +#ifdef PADDLE_WITH_HIP +#include +namespace cub = hipcub; +#else +#include +#endif + +#include +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/softmax_kernel_impl.h" +#include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +void GetClassInterval(const gpuStream_t& stream, + const phi::Place& place, + const Context& dev_ctx, + const int rid, + const int rank, + const int nranks, + const int D, + DenseTensor* class_interval) { + std::vector shard_dim_vec(nranks + 1, 0); + shard_dim_vec[rank + 1] = D; + if (nranks <= 1) { + paddle::framework::TensorFromVector(shard_dim_vec, dev_ctx, class_interval); + return; + } +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + DenseTensor num_classes_per_device; + paddle::framework::TensorFromVector( + shard_dim_vec, dev_ctx, &num_classes_per_device); + int* num_classes_per_device_ptr = num_classes_per_device.data(); + + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + paddle::distributed::ProcessGroup* pg = map->get(rid); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(num_classes_per_device); + out_tensor.push_back(num_classes_per_device); + + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = paddle::distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + const auto& comm = + paddle::platform::NCCLCommContext::Instance().Get(rid, place); + // use global calculate stream + const auto calcu_stream = + static_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( + num_classes_per_device_ptr, + num_classes_per_device_ptr, + num_classes_per_device.numel(), + paddle::platform::ToNCCLDataType(paddle::framework::TransToProtoVarType( + num_classes_per_device.dtype())), + ncclSum, + comm->comm(), + calcu_stream)); + } + + class_interval->Resize({nranks + 1}); + auto class_interval_ptr = dev_ctx.template Alloc(class_interval); + + size_t cub_temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum( + nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream); + auto cub_temp_storage = paddle::memory::Alloc(place, cub_temp_storage_bytes); + cub::DeviceScan::InclusiveSum(cub_temp_storage->ptr(), + cub_temp_storage_bytes, + num_classes_per_device_ptr, + class_interval_ptr, + nranks + 1, + stream); + return; +#endif +} + +template +__global__ void CalculateGrad(T* logits_grad, + const T* loss_grad, + const T* logits, + 
const IndexT* label, + const float margin1, + const float margin2, + const float scale, + const int rank, + const int64_t N, + const int64_t D, + const int* class_interval_ptr) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + int start_index = class_interval_ptr[rank]; + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + auto col = i % D; + if ((col + start_index) == label[row]) { + logits_grad[i] = (logits_grad[i] - static_cast(1.0)) * loss_grad[row]; + if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) { + MPType dout = static_cast(logits_grad[i]); + MPType one = static_cast(1.0f); + MPType x = static_cast(logits[i]); + MPType m1 = static_cast(margin1); + MPType m2 = static_cast(margin2); + + MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x); + logits_grad[i] = static_cast(dout * d); + } + } else { + logits_grad[i] *= loss_grad[row]; + } + if (fabs(scale - 1.0) > 1e-8) { + logits_grad[i] *= static_cast(scale); + } + } +} + +template +void MarginCrossEntropyGradKernel(const Context& dev_ctx, + const DenseTensor& logits, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + bool return_softmax, + int ring_id, + int rank, + int nranks, + float margin1, + float margin2, + float margin3, + float scale, + DenseTensor* logits_grad) { + const auto softmax_dims = softmax.dims(); + const int axis = softmax_dims.size() - 1; + const int N = phi::funcs::SizeToAxis(axis, softmax_dims); + const int D = phi::funcs::SizeFromAxis(axis, softmax_dims); + + if (return_softmax) { + phi::Copy( + dev_ctx, softmax, dev_ctx.GetPlace(), false, logits_grad); + } else { + logits_grad->ShareDataWith(softmax); + } + + int blocks = NumBlocks(N * D); + int threads = kNumCUDAThreads; + const auto& label_type = + paddle::framework::TransToProtoVarType(label.dtype()); + + DenseTensor class_interval; + GetClassInterval(dev_ctx.stream(), + dev_ctx.GetPlace(), + dev_ctx, + ring_id, + rank, + nranks, + D, + &class_interval); + + if (label_type == paddle::framework::proto::VarType::INT32) { + typedef int32_t LabelT; + CalculateGrad + <<>>(logits_grad->data(), + loss_grad.data(), + logits.data(), + label.data(), + margin1, + margin2, + scale, + rank, + N, + D, + class_interval.data()); + } else if (label_type == paddle::framework::proto::VarType::INT64) { + typedef int64_t LabelT; + CalculateGrad + <<>>(logits_grad->data(), + loss_grad.data(), + logits.data(), + label.data(), + margin1, + margin2, + scale, + rank, + N, + D, + class_interval.data()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(margin_cross_entropy_grad, + GPU, + ALL_LAYOUT, + phi::MarginCrossEntropyGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/margin_cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/margin_cross_entropy_kernel.cu new file mode 100644 index 0000000000000..a92daab07a1fe --- /dev/null +++ b/paddle/phi/kernels/gpu/margin_cross_entropy_kernel.cu @@ -0,0 +1,483 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// old op include, fluid should be removed +#ifdef PADDLE_WITH_HIP +#include +namespace cub = hipcub; +#else +#include +#endif + +#include +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif +// trace op include +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +void GetClassInterval(const gpuStream_t& stream, + const phi::Place& place, + const Context& dev_ctx, + const int rid, + const int rank, + const int nranks, + const int D, + DenseTensor* class_interval) { + std::vector shard_dim_vec(nranks + 1, 0); + shard_dim_vec[rank + 1] = D; + if (nranks <= 1) { + paddle::framework::TensorFromVector(shard_dim_vec, dev_ctx, class_interval); + return; + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + DenseTensor num_classes_per_device; + paddle::framework::TensorFromVector( + shard_dim_vec, dev_ctx, &num_classes_per_device); + int* num_classes_per_device_ptr = num_classes_per_device.data(); + + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + paddle::distributed::ProcessGroup* pg = map->get(rid); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(num_classes_per_device); + out_tensor.push_back(num_classes_per_device); + + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = paddle::distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + const auto& comm = + paddle::platform::NCCLCommContext::Instance().Get(rid, place); + // use global calculate stream + const auto calcu_stream = + static_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( + num_classes_per_device_ptr, + num_classes_per_device_ptr, + num_classes_per_device.numel(), + paddle::platform::ToNCCLDataType(paddle::framework::TransToProtoVarType( + num_classes_per_device.dtype())), + ncclSum, + comm->comm(), + calcu_stream)); + } + + class_interval->Resize({nranks + 1}); + auto class_interval_ptr = dev_ctx.template Alloc(class_interval); + size_t cub_temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum( + nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream); + auto cub_temp_storage = paddle::memory::Alloc(place, cub_temp_storage_bytes); + cub::DeviceScan::InclusiveSum(cub_temp_storage->ptr(), + cub_temp_storage_bytes, + num_classes_per_device_ptr, + class_interval_ptr, + nranks + 1, + stream); + return; +#endif +} + +template +__global__ void AddMarginToPositiveLogitsKernel(T* logit, + const IndexT* label, + const float margin1, + const float margin2, + const float margin3, + const int rank, + const int nranks, + const int64_t N, + 
const int64_t D, + const int* class_interval_ptr) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + int start_index = class_interval_ptr[rank]; + int end_index = class_interval_ptr[rank + 1]; + int num_classes = class_interval_ptr[nranks]; + CUDA_KERNEL_LOOP(i, N) { + auto real_label = label[i]; + PADDLE_ENFORCE((real_label < num_classes) && (real_label >= 0), + "The index is out of bounds, " + "please check whether the value of label and " + "input meet the number of class. It should " + "be less than [%d], but received [%d]", + num_classes, + real_label); + + if (real_label >= start_index && real_label < end_index) { + int64_t offset = i * D + real_label - start_index; + if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) { + MPType x = static_cast(logit[offset]); + MPType theta = acos(x); + if (fabs(margin1 - 1.0) > 1e-8) { + theta *= static_cast(margin1); + } + if (fabs(margin2) > 1e-8) { + theta += static_cast(margin2); + } + logit[offset] = static_cast(cos(theta)); + } + if (fabs(margin3) > 1e-8) { + MPType y = static_cast(logit[offset]); + y -= static_cast(margin3); + logit[offset] = static_cast(y); + } + } + } +} + +template +__global__ void ScaleLogitKernel(T* logits, + const float scale, + const int64_t N, + const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { logits[i] *= static_cast(scale); } +} + +template +__global__ void LogitsMinusMaxKernel(T* logits, + const T* logits_max_per_row, + const int64_t N, + const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + logits[i] -= logits_max_per_row[row]; + } +} + +template +__global__ void LogitsMinusLogSumKernel(T* logits, + const T* logits_sum_per_row, + const int64_t N, + const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + logits[i] -= phi::kps::details::Log(logits_sum_per_row[row]); + } +} + +template +__global__ void HardLabelSoftmaxWithCrossEntropyKernel( + T* loss, + T* log_softmax, + const IndexT* labels, + const int rank, + const int64_t N, + const int64_t D, + const int* class_interval_ptr) { + int start_index = class_interval_ptr[rank]; + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + auto col = i % D; + if ((col + start_index) == labels[row]) { + auto softmax = log_softmax[i]; + loss[row] = -softmax; + log_softmax[i] = phi::kps::details::Exp(softmax); + } else { + log_softmax[i] = phi::kps::details::Exp(log_softmax[i]); + } + } +} + +template +void MarginCrossEntropyKernel(const Context& dev_ctx, + const DenseTensor& logits, + const DenseTensor& labels, + bool return_softmax, + int ring_id, + int rank, + int nranks, + float margin1, + float margin2, + float margin3, + float scale, + DenseTensor* softmax, + DenseTensor* loss) { + const auto& place = dev_ctx.GetPlace(); // old code + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + paddle::platform::NCCLComm* comm; + paddle::distributed::ProcessGroup* pg = nullptr; + gpuStream_t stream; + if (nranks > 1) { + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(ring_id)) { + // Use ProcessGroup + pg = map->get(ring_id); + } else { + comm = paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); + + // use global calculate stream + stream = static_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + } + } +#endif + + // allocate memory on device. 
+ T* softmax_ptr = dev_ctx.template Alloc(softmax); + T* loss_ptr = dev_ctx.template Alloc(loss); + + const auto& logits_dims = logits.dims(); + const auto& labels_dims = labels.dims(); + + const int axis = logits_dims.size() - 1; + const int N = phi::funcs::SizeToAxis(axis, logits_dims); + const int D = phi::funcs::SizeFromAxis(axis, logits_dims); + + int blocks = NumBlocks(N); + int threads = kNumCUDAThreads; + const auto& label_type = + paddle::framework::TransToProtoVarType(labels.dtype()); + + // copy logits to softmax variable since we can't modify logits, + // and it also be used when calculate grad + phi::Copy(dev_ctx, logits, dev_ctx.GetPlace(), true, softmax); + + DenseTensor softmax_2d; + softmax_2d.ShareDataWith(*softmax).Resize({N, D}); + T* logits_ptr = softmax_2d.data(); + + DenseTensor class_interval; + GetClassInterval(dev_ctx.stream(), + dev_ctx.GetPlace(), + dev_ctx, + ring_id, + rank, + nranks, + D, + &class_interval); + + // step 1, preprocess logits + // add margin for positive elements + // theta = acos(x_i) + // (cos(m1 * theta + m2) - m3) + // save match_logits, used for gradient computation. + if (label_type == paddle::framework::proto::VarType::INT32) { + typedef int32_t LabelT; + AddMarginToPositiveLogitsKernel + <<>>( + logits_ptr, + labels.data(), + margin1, + margin2, + margin3, + rank, + nranks, + N, + D, + class_interval.data()); + } else if (label_type == paddle::framework::proto::VarType::INT64) { + typedef int64_t LabelT; + AddMarginToPositiveLogitsKernel + <<>>( + logits_ptr, + labels.data(), + margin1, + margin2, + margin3, + rank, + nranks, + N, + D, + class_interval.data()); + } else { + PADDLE_THROW(errors::Unimplemented( + "margin_cross_entropy label type noly support int32 and int64, " + "but got %s", + label_type)); + } + + // scale by s + ScaleLogitKernel<<>>( + logits_ptr, scale, N, D); + + // step 2, obtain logit_max + DenseTensor logits_max; + logits_max.Resize({N, 1}); + dev_ctx.template Alloc(&logits_max); + T* logits_max_buff = dev_ctx.template Alloc(&logits_max); + + phi::funcs:: + ReduceKernel>( + static_cast(dev_ctx), + softmax_2d, + &logits_max, + phi::kps::IdentityFunctor(), + {1}); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + if (pg) { + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(logits_max); + out_tensor.push_back(logits_max); + + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = paddle::distributed::ReduceOp::MAX; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( + logits_max_buff, + logits_max_buff, + logits_max.numel(), + paddle::platform::ToNCCLDataType( + paddle::framework::TransToProtoVarType(logits_max.dtype())), + ncclMax, + comm->comm(), + stream)); + } + } +#endif + + // step 3, logit - logit_max + LogitsMinusMaxKernel<<>>( + logits_ptr, logits_max_buff, N, D); + + // step 4, sum(exp(logit - logit_max)) + DenseTensor sum_exp_logits; + sum_exp_logits.Resize({N, 1}); + dev_ctx.template Alloc(&sum_exp_logits); + T* sum_exp_logits_buff = dev_ctx.template Alloc(&sum_exp_logits); + phi::funcs::ReduceKernel>( + static_cast(dev_ctx), + softmax_2d, + &sum_exp_logits, + phi::kps::ExpFunctor(), + {1}); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + if (pg) { + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(sum_exp_logits); + out_tensor.push_back(sum_exp_logits); + + 
paddle::distributed::AllreduceOptions opts; + opts.reduce_op = paddle::distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( + sum_exp_logits_buff, + sum_exp_logits_buff, + sum_exp_logits.numel(), + paddle::platform::ToNCCLDataType( + paddle::framework::TransToProtoVarType(sum_exp_logits.dtype())), + ncclSum, + comm->comm(), + stream)); + } + } +#endif + + // step 5, (logit - logit_max) - log(sum(exp(logit - logit_max))) + LogitsMinusLogSumKernel + <<>>( + logits_ptr, sum_exp_logits_buff, N, D); + + // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - + // logit_max)))) + // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max)))) + + phi::funcs::SetConstant functor; + functor(dev_ctx, loss, static_cast(0.0)); + if (label_type == paddle::framework::proto::VarType::INT32) { + typedef int32_t LabelT; + HardLabelSoftmaxWithCrossEntropyKernel + <<>>(loss_ptr, + logits_ptr, + labels.data(), + rank, + N, + D, + class_interval.data()); + } else if (label_type == paddle::framework::proto::VarType::INT64) { + typedef int64_t LabelT; + HardLabelSoftmaxWithCrossEntropyKernel + <<>>(loss_ptr, + logits_ptr, + labels.data(), + rank, + N, + D, + class_interval.data()); + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + if (pg) { + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(*loss); + out_tensor.push_back(*loss); + + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = paddle::distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( + loss_ptr, + loss_ptr, + loss->numel(), + paddle::platform::ToNCCLDataType( + paddle::framework::TransToProtoVarType(loss->dtype())), + ncclSum, + comm->comm(), + stream)); + } + } +#endif +} + +} // namespace phi + +PD_REGISTER_KERNEL(margin_cross_entropy, + GPU, + ALL_LAYOUT, + phi::MarginCrossEntropyKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h index fe3cd89d5bc97..5d90433ad22e3 100644 --- a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h +++ b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h @@ -38,6 +38,9 @@ void ReduceCudaAMaxAMinGrad(const Context& dev_ctx, auto* d_x = x_grad; // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = in_x->dims().size(); + if (dims.size() == 0) { + reduce_all = true; + } auto reduce_dims = funcs::details::GetReduceDim(dims, dim_size, reduce_all); auto update_dims = vectorize(d_x->dims()); int reduce_num = 1; diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu index 8b111641cfa40..c0955cd7424ae 100644 --- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu @@ -37,6 +37,9 @@ void ReduceSumGradKernel(const Context& dev_ctx, // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = in_x->dims().size(); + if (dims.size() == 0) { + reduce_all = true; + } std::vector reduce_dims = funcs::details::GetReduceDim(dims, dim_size, reduce_all); diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index 6f96a697b2f2d..1a574c05494fd 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ 
b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -74,4 +74,6 @@ PD_REGISTER_KERNEL(scale, int8_t, int16_t, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu index a1d4b681ca053..ed8a8c333442d 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu @@ -178,7 +178,18 @@ PD_REGISTER_KERNEL(sync_batch_norm, ALL_LAYOUT, phi::SyncBatchNormKernel, float, - phi::dtype::float16) {} + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} #else PD_REGISTER_KERNEL(sync_batch_norm, GPU, @@ -186,5 +197,16 @@ PD_REGISTER_KERNEL(sync_batch_norm, phi::SyncBatchNormKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} #endif diff --git a/paddle/phi/kernels/gpu/unpool_grad_kernel.cu b/paddle/phi/kernels/gpu/unpool_grad_kernel.cu new file mode 100644 index 0000000000000..24d4193ed6c0a --- /dev/null +++ b/paddle/phi/kernels/gpu/unpool_grad_kernel.cu @@ -0,0 +1,203 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
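The sync_batch_norm registration above pins scale/bias/mean/variance to FLOAT32 when the kernel data type is FLOAT16. A minimal sketch of the underlying idea, accumulating statistics in float regardless of the element type (illustration only; MeanVarFp32 is a made-up name and T is assumed convertible to float):

#include <cstdint>

// Per-tensor mean/variance accumulated in float so that half-precision
// inputs do not lose accuracy over long reductions.
template <typename T>
void MeanVarFp32(const T* x, int64_t n, float* mean, float* var) {
  float sum = 0.f, sq_sum = 0.f;
  for (int64_t i = 0; i < n; ++i) {
    float v = static_cast<float>(x[i]);
    sum += v;
    sq_sum += v * v;
  }
  *mean = sum / static_cast<float>(n);
  *var = sq_sum / static_cast<float>(n) - (*mean) * (*mean);
}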
+ +#include "paddle/phi/kernels/unpool_grad_kernel.h" + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +__global__ void KernelUnpool2dMaxGrad(const int nthreads, + const T* input_data, + const int* indices_data, + const int input_height, + const int input_width, + const int channels, + const T* output_data, + const T* output_grad, + const int output_height, + const int output_width, + T* input_grad) { + CUDA_KERNEL_LOOP(linearIndex, nthreads) { + int c = (linearIndex / input_width / input_height) % channels; + int n = linearIndex / input_width / input_height / channels; + output_grad += (n * channels + c) * output_height * output_width; + int maxind = indices_data[linearIndex]; + input_grad[linearIndex] = output_grad[maxind]; + } +} + +template +__global__ void KernelUnpool3dMaxGrad(const int nthreads, + const T* input_data, + const int* indices_data, + const int input_depth, + const int input_height, + const int input_width, + const int channels, + const T* output_data, + const T* output_grad, + const int output_depth, + const int output_height, + const int output_width, + T* input_grad) { + CUDA_KERNEL_LOOP(linearIndex, nthreads) { + int c = (linearIndex / input_depth / input_width / input_height) % channels; + int n = linearIndex / input_depth / input_width / input_height / channels; + output_grad += + (n * channels + c) * output_depth * output_height * output_width; + int maxind = indices_data[linearIndex]; + input_grad[linearIndex] = output_grad[maxind]; + } +} + +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& indices, + const DenseTensor& output, + const DenseTensor& output_grad, + DenseTensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = dev_ctx.template Alloc(input_grad); +#ifdef __HIPCC__ + int threads = 256; +#else + int threads = 1024; +#endif + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMaxGrad + <<>>(input.numel(), + input_data, + indices_data, + input_height, + input_width, + output_channels, + output_data, + output_grad_data, + output_height, + output_width, + input_grad_data); + } +}; + +template +class Unpool3dMaxGradFunctor { + public: + void operator()(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& indices, + const DenseTensor& output, + const DenseTensor& output_grad, + DenseTensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = 
dev_ctx.template Alloc(input_grad); +#ifdef __HIPCC__ + int threads = 256; +#else + int threads = 1024; +#endif + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool3dMaxGrad + <<>>(input.numel(), + input_data, + indices_data, + input_depth, + input_height, + input_width, + output_channels, + output_data, + output_grad_data, + output_depth, + output_height, + output_width, + input_grad_data); + } +}; + +template +void UnpoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + DenseTensor* x_grad) { + T* input_grad_data = dev_ctx.template Alloc(x_grad); + const T* output_grad_data = out_grad.data(); + phi::funcs::SetConstant zero; + zero(dev_ctx, x_grad, static_cast(0)); + Unpool2dMaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(dev_ctx, x, indices, out, out_grad, x_grad); +} + +template +void Unpool3dGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + DenseTensor* x_grad) { + T* input_grad_data = dev_ctx.template Alloc(x_grad); + const T* output_grad_data = out_grad.data(); + phi::funcs::SetConstant zero; + zero(dev_ctx, x_grad, static_cast(0)); + Unpool3dMaxGradFunctor unpool3d_max_backward; + unpool3d_max_backward(dev_ctx, x, indices, out, out_grad, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + unpool_grad, GPU, ALL_LAYOUT, phi::UnpoolGradKernel, float, double) {} + +PD_REGISTER_KERNEL( + unpool3d_grad, GPU, ALL_LAYOUT, phi::Unpool3dGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/unpool_kernel.cu b/paddle/phi/kernels/gpu/unpool_kernel.cu new file mode 100644 index 0000000000000..c9ded2fd822bd --- /dev/null +++ b/paddle/phi/kernels/gpu/unpool_kernel.cu @@ -0,0 +1,188 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
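KernelUnpool2dMaxGrad above is a plain gather: each input position reads the gradient stored at the output index recorded during the forward pass. A CPU reference sketch for a single (n, c) plane (illustration only; the helper name is hypothetical):

#include <vector>

template <typename T>
void Unpool2dMaxGradRef(const std::vector<T>& out_grad,   // output_height * output_width
                        const std::vector<int>& indices,  // input_height * input_width
                        std::vector<T>* in_grad) {
  in_grad->resize(indices.size());
  for (size_t i = 0; i < indices.size(); ++i) {
    (*in_grad)[i] = out_grad[indices[i]];  // input_grad[i] = output_grad[maxind]
  }
}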
+ +#include "paddle/phi/kernels/unpool_kernel.h" + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +__global__ void KernelUnpool2dMax(const int nthreads, + const T* input_data, + const int* indices_data, + const int input_height, + const int input_width, + const int channels, + T* output_data, + const int output_height, + const int output_width){ + CUDA_KERNEL_LOOP(linearIndex, nthreads){ + int c = (linearIndex / input_width / input_height) % channels; +int n = linearIndex / input_width / input_height / channels; +output_data += (n * channels + c) * output_height * output_width; +int maxind = indices_data[linearIndex]; +output_data[maxind] = input_data[linearIndex]; +} // namespace phi +} +; + +template +__global__ void KernelUnpool3dMax( + const int nthreads, + const T* input_data, + const int* indices_data, + const int input_depth, + const int input_height, + const int input_width, + const int channels, + T* output_data, + const int output_depth, + const int output_height, + const int output_width){CUDA_KERNEL_LOOP(linearIndex, nthreads){ + int c = (linearIndex / input_depth / input_width / input_height) % channels; +int n = linearIndex / input_depth / input_width / input_height / channels; +output_data += (n * channels + c) * output_depth * output_height * output_width; +int maxind = indices_data[linearIndex]; +output_data[maxind] = input_data[linearIndex]; +} +} +; + +template +class Unpool2dMaxFunctor { + public: + void operator()(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& indices, + DenseTensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = dev_ctx.template Alloc(output); +#ifdef __HIPCC__ + int threads = 256; +#else + int threads = 1024; +#endif + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMax + <<>>(input.numel(), + input_data, + indices_data, + input_height, + input_width, + output_channels, + output_data, + output_height, + output_width); + } +}; + +template +class Unpool3dMaxFunctor { + public: + void operator()(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& indices, + DenseTensor* output) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = dev_ctx.template Alloc(output); +#ifdef __HIPCC__ + int threads = 256; +#else + int threads = 1024; +#endif + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool3dMax + <<>>(input.numel(), + input_data, + indices_data, + input_depth, + input_height, + input_width, + output_channels, + output_data, + output_depth, + output_height, + output_width); + } +}; + +template +void UnpoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + 
const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + DenseTensor* out) { + T* output_data = dev_ctx.template Alloc(out); + if (output_data) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + } + + Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(dev_ctx, x, indices, out); +} + +template +void Unpool3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_size, + const std::string& data_format, + DenseTensor* out) { + T* output_data = dev_ctx.template Alloc(out); + if (output_data) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + } + + Unpool3dMaxFunctor unpool3d_max_forward; + unpool3d_max_forward(dev_ctx, x, indices, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + unpool, GPU, ALL_LAYOUT, phi::UnpoolKernel, int, float, double) {} + +PD_REGISTER_KERNEL( + unpool3d, GPU, ALL_LAYOUT, phi::Unpool3dKernel, int, float, double) {} diff --git a/paddle/phi/kernels/graph_send_recv_kernel.h b/paddle/phi/kernels/graph_send_recv_kernel.h index 8f635225b75a4..cd625c92b93ea 100644 --- a/paddle/phi/kernels/graph_send_recv_kernel.h +++ b/paddle/phi/kernels/graph_send_recv_kernel.h @@ -16,6 +16,7 @@ #include +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -26,7 +27,7 @@ void GraphSendRecvKernel(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& pool_type, - int64_t out_size, + const IntArray& out_size, DenseTensor* out, DenseTensor* dst_count); diff --git a/paddle/phi/kernels/impl/fft_grad_kernel_impl.h b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h new file mode 100644 index 0000000000000..02a8fbd14bca9 --- /dev/null +++ b/paddle/phi/kernels/impl/fft_grad_kernel_impl.h @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
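The forward unpool kernels above perform the opposite scatter: the output is zero-filled by SetConstant and each input value is written to its recorded max location, so positions never chosen as a max stay zero. A CPU reference sketch for one (n, c) plane (illustration only; the helper name is hypothetical):

#include <vector>

template <typename T>
void Unpool2dMaxRef(const std::vector<T>& input,      // input_height * input_width
                    const std::vector<int>& indices,  // same length as input
                    std::vector<T>* output,           // output_height * output_width
                    size_t output_plane_size) {
  output->assign(output_plane_size, T(0));  // mirrors set_zero(dev_ctx, out, 0)
  for (size_t i = 0; i < input.size(); ++i) {
    (*output)[indices[i]] = input[i];  // output_data[maxind] = input_data[i]
  }
}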
+ +#pragma once +#include "paddle/phi/kernels/fft_grad_kernel.h" + +#include +#include + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_fill_conj.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/pad_kernel.h" + +namespace phi { +template +void FFTC2CGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* x_grad) { + ctx.template Alloc(x_grad); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, out_grad, x_grad, axes, norm_type, !forward); +} + +template +void FFTR2CGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* x_grad) { + using R = typename T::value_type; + DenseTensor complex_x_grad = EmptyLike(ctx, x); + ctx.template Alloc(x_grad); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2CFunctor fft_c2c_func; + + if (!onesided) { + fft_c2c_func(ctx, out_grad, &complex_x_grad, axes, norm_type, !forward); + } else { + DenseTensor full_dy; + DenseTensorMeta full_dy_meta(out_grad.type(), x_grad->dims()); + full_dy.set_meta(full_dy_meta); + auto zero_length = static_cast(full_dy.dims().at(axes.back()) - + out_grad.dims().at(axes.back())); + auto rank = out_grad.dims().size(); + std::vector pads(rank * 2, 0); + pads[axes.back() * 2 + 1] = zero_length; + PadKernel(ctx, out_grad, pads, static_cast(0.0), &full_dy); + fft_c2c_func(ctx, full_dy, &complex_x_grad, axes, norm_type, !forward); + } + RealKernel(ctx, complex_x_grad, x_grad); +} + +template +void FFTC2RGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* x_grad) { + using C = phi::dtype::complex; + ctx.template Alloc(x_grad); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + + funcs::FFTR2CFunctor fft_r2c_func; + fft_r2c_func(ctx, out_grad, x_grad, axes, norm_type, !forward); + + const int64_t double_length = + out_grad.dims()[axes.back()] - x_grad->dims()[axes.back()]; + const phi::DDim strides = phi::stride(x_grad->dims()); + +#if defined(__NVCC__) || defined(__HIPCC__) + const thrust::device_vector strides_g(phi::vectorize(strides)); + const int64_t* pstrides = thrust::raw_pointer_cast(strides_g.data()); +#else + const int64_t* pstrides = strides.Get(); +#endif + + funcs::FFTFillConjGradFunctor func( + x_grad->data(), axes.back(), pstrides, double_length); + size_t limit = x_grad->numel(); + funcs::ForRange for_range(ctx, limit); + for_range(func); +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/fft_kernel_impl.h b/paddle/phi/kernels/impl/fft_kernel_impl.h new file mode 100644 index 0000000000000..d441093db07c9 --- /dev/null +++ b/paddle/phi/kernels/impl/fft_kernel_impl.h @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/kernels/fft_kernel.h" + +#include +#include + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/fft.h" +#include "paddle/phi/kernels/funcs/fft_fill_conj.h" + +namespace phi { +template +void FFTC2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + DenseTensor* out) { + ctx.template Alloc(out); + const auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2CFunctor fft_c2c_func; + fft_c2c_func(ctx, x, out, axes, norm_type, forward); +} + +template +void FFTC2RKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + int64_t last_dim_size, + DenseTensor* out) { + using R = typename T::value_type; // get real type + ctx.template Alloc(out); + const auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTC2RFunctor fft_c2r_func; + fft_c2r_func(ctx, x, out, axes, norm_type, forward); +} + +template +void FFTR2CKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axes, + const std::string& normalization, + bool forward, + bool onesided, + DenseTensor* out) { + using C = phi::dtype::complex; + ctx.template Alloc(out); + auto norm_type = funcs::get_norm_from_string(normalization, forward); + funcs::FFTR2CFunctor fft_r2c_func; + + if (onesided) { + fft_r2c_func(ctx, x, out, axes, norm_type, forward); + } else { + phi::DDim onesided_out_shape = x.dims(); + const int64_t last_fft_axis = axes.back(); + const int64_t onesided_last_axis_size = + out->dims().at(last_fft_axis) / 2 + 1; + onesided_out_shape[last_fft_axis] = onesided_last_axis_size; + DenseTensor onesided_out = + Empty(ctx, phi::vectorize(onesided_out_shape)); + fft_r2c_func(ctx, x, &onesided_out, axes, norm_type, forward); + funcs::FFTFillConj(ctx, &onesided_out, out, axes); + } +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/fill_grad_kernel_impl.h b/paddle/phi/kernels/impl/fill_grad_kernel_impl.h new file mode 100644 index 0000000000000..dffb81fbea4e3 --- /dev/null +++ b/paddle/phi/kernels/impl/fill_grad_kernel_impl.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
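FFTR2CKernel above keeps only the non-redundant half of the spectrum when onesided is true, and FFTR2CGradKernel in fft_grad_kernel_impl.h zero-pads the gradient back to the full length before running the C2C transform. A worked size example under an assumed last-axis length of 8 (illustrative only):

    // Sketch of the sizes used by the onesided real-FFT path, assuming n = 8.
    int64_t n = 8;                           // full length of the last FFT axis
    int64_t onesided_len = n / 2 + 1;        // 5 bins kept by the onesided output
    int64_t zero_length = n - onesided_len;  // 3 bins zero-padded in the grad path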
+ +#pragma once + +#include "paddle/phi/kernels/fill_grad_kernel.h" + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void FillGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const Scalar& value, + DenseTensor* in_grad) { + if (in_grad) { + dev_ctx.template Alloc(in_grad); + + phi::funcs::SetConstant functor; + functor(dev_ctx, in_grad, T(0)); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/fill_kernel_impl.h b/paddle/phi/kernels/impl/fill_kernel_impl.h new file mode 100644 index 0000000000000..7d10ea42bd6b6 --- /dev/null +++ b/paddle/phi/kernels/impl/fill_kernel_impl.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/fill_kernel.h" + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void FillKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& value, + DenseTensor* out) { + T fill_var = value.to(); + + PADDLE_ENFORCE_EQ(std::isnan(static_cast(fill_var)), + false, + phi::errors::InvalidArgument("fill value should not be NaN," + " but received NaN")); + + dev_ctx.template Alloc(out); + + phi::funcs::SetConstant functor; + functor(dev_ctx, out, fill_var); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h new file mode 100644 index 0000000000000..b9320eab85046 --- /dev/null +++ b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
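FillGradKernel above zeroes in_grad because the forward fill writes the same scalar into every element regardless of x, so the Jacobian vanishes. As a one-line derivation (standard reasoning, not quoted from the patch):

    y_i = v \;\; \forall i \;\Rightarrow\; \partial y_i / \partial x_j = 0 \;\Rightarrow\; \nabla_x L = J^\top \nabla_y L = 0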
+ +#pragma once + +#include + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/unfold_functor.h" + +namespace phi { + +template +void FoldGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& output_sizes, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + DenseTensor* x_grad) { + ctx.template Alloc(x_grad); + + if (!x_grad) return; + + const auto& x_dims = x_grad->dims(); + const int batch_size = static_cast(x_dims[0]); + + int output_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + int output_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + + 1; + + int n_input_plane = x_dims[1]; + int n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); + + DDim out_shape = + make_ddim({n_output_plane, output_sizes[0], output_sizes[1]}); + DDim input_matrix_shape = make_ddim({x_dims[0], + kernel_sizes[0], + kernel_sizes[1], + output_height, + output_width}); + + paddle::operators::math:: + Im2ColFunctor + im2col; + + for (int i = 0; i < batch_size; i++) { + DenseTensor out_grad_batch = out_grad.Slice(i, i + 1).Resize(out_shape); + DenseTensor x_grad_batch = + x_grad->Slice(i, i + 1).Resize(input_matrix_shape); + im2col(ctx, out_grad_batch, dilations, strides, paddings, &x_grad_batch); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/fold_kernel_impl.h b/paddle/phi/kernels/impl/fold_kernel_impl.h new file mode 100644 index 0000000000000..415beca7bd928 --- /dev/null +++ b/paddle/phi/kernels/impl/fold_kernel_impl.h @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
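The output_height and output_width expressions in FoldGradKernel above (and in FoldKernel that follows) are the usual im2col sliding-window counts. A worked example with assumed sizes (illustrative only):

    // Assumed: output_sizes = {4, 4}, kernel_sizes = {2, 2}, strides = {1, 1},
    // paddings = {0, 0}, dilations = {1, 1}, and 12 unfolded input channels.
    int output_height = (4 + 2 * 0 - (1 * (2 - 1) + 1)) / 1 + 1;  // 3 positions
    int output_width = (4 + 2 * 0 - (1 * (2 - 1) + 1)) / 1 + 1;   // 3 positions
    int n_output_plane = 12 / (2 * 2);  // folds back to 3 image channels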
+ +#pragma once + +#include + +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/unfold_functor.h" + +namespace phi { + +template +void FoldKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& output_sizes, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + DenseTensor* out) { + const int batch_size = static_cast(x.dims()[0]); + ctx.template Alloc(out); + + paddle::operators::math:: + Col2ImFunctor + col2im; + const auto& x_dims = x.dims(); + + int output_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + int output_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + + 1; + + int n_input_plane = x_dims[1]; + int n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); + + DDim output_shape = + make_ddim({n_output_plane, output_sizes[0], output_sizes[1]}); + + DDim input_matrix_shape = make_ddim({x_dims[0], + kernel_sizes[0], + kernel_sizes[1], + output_height, + output_width}); + + phi::funcs::SetConstant set_zero; + set_zero(ctx, out, static_cast(0)); + + for (int i = 0; i < batch_size; i++) { + DenseTensor out_batch = + out->Slice(i, i + 1).Resize(output_shape); // im size=3 + DenseTensor in_batch = + x.Slice(i, i + 1).Resize(input_matrix_shape); // col size=5 + col2im(ctx, in_batch, dilations, strides, paddings, &out_batch); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/lamb_kernel_impl.h b/paddle/phi/kernels/impl/lamb_kernel_impl.h new file mode 100644 index 0000000000000..f3a76c6a7f1dd --- /dev/null +++ b/paddle/phi/kernels/impl/lamb_kernel_impl.h @@ -0,0 +1,296 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
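For reference, ComputeImpl below evaluates the standard LAMB rule; the trust ratio is the quotient of the two squared-L2 norms computed near the end of the function. A summary of the usual formulation (not quoted from the patch):

    m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t
    v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2
    r_t = \frac{m_t/(1-\beta_1^t)}{\sqrt{v_t/(1-\beta_2^t)} + \epsilon} + \lambda\,\theta_{t-1}
    \theta_t = \theta_{t-1} - \eta\,\frac{\lVert\theta_{t-1}\rVert_2}{\lVert r_t\rVert_2}\, r_t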
+#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/lamb_functors.h" + +namespace phi { + +template +void ComputeImpl(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& lr, + const DenseTensor& mom1, + const DenseTensor& mom2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + const paddle::optional& master_param_opt, + const paddle::optional& skip_update_opt, + float weight_decay_f, + float beta1_f, + float beta2_f, + float epsilon_f, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* mom1_out, + DenseTensor* mom2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_out); + +template +void LambKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + const paddle::optional& master_param, + const paddle::optional& skip_update, + float weight_decay, + float beta1, + float beta2, + float epsilon, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + using MT = typename phi::dtype::MPTypeTrait::Type; + if (multi_precision) { + ComputeImpl(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + weight_decay, + beta1, + beta2, + epsilon, + multi_precision, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); + } else { + ComputeImpl(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + weight_decay, + beta1, + beta2, + epsilon, + multi_precision, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); + } +} + +template +void ComputeImpl(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& lr, + const DenseTensor& mom1, + const DenseTensor& mom2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + const paddle::optional& master_param_opt, + const paddle::optional& skip_update_opt, + float weight_decay_f, + float beta1_f, + float beta2_f, + float epsilon_f, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* mom1_out, + DenseTensor* mom2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_out) { + if (!IsMultiPrecision) { + constexpr auto kIsSameType = std::is_same::value; + PADDLE_ENFORCE_EQ( + kIsSameType, + true, + phi::errors::InvalidArgument( + "When multi_precision=False, T and MT must be the same type.")); + } + + const auto* master_param = + IsMultiPrecision ? master_param_opt.get_ptr() : nullptr; + const auto* skip_update = skip_update_opt.get_ptr(); + const bool* skip_update_flag = skip_update && skip_update->IsInitialized() + ? 
skip_update->data() + : nullptr; + if (skip_update_flag && + paddle::platform::is_cpu_place(skip_update->place()) && + (*skip_update_flag)) { + return; + } + + auto weight_decay = static_cast(weight_decay_f); + auto beta1 = static_cast(beta1_f); + auto beta2 = static_cast(beta2_f); + auto epsilon = static_cast(epsilon_f); + auto numel = param.numel(); + phi::funcs::ForRange for_range(dev_ctx, numel); + DenseTensor trust_ratio_div; + trust_ratio_div.Resize(param.dims()); + auto* trust_ratio_div_ptr = dev_ctx.template Alloc(&trust_ratio_div); + + const void* param_ptr = param.data(); + const void* master_param_ptr = master_param ? master_param->data() : nullptr; + void* param_out_ptr = dev_ctx.template Alloc(param_out); + void* master_param_out_ptr = + master_param_out ? dev_ctx.template Alloc(master_param_out) : nullptr; + // Update moments + bool should_update_beta_pow_later = false; + const MT *beta1_pow_ptr = nullptr, *beta2_pow_ptr = nullptr; + MT *beta1_pow_out_ptr = nullptr, *beta2_pow_out_ptr = nullptr; + VLOG(10) << "Beta1Pow place: " << beta1_pow.place() + << " , Beta2Pow place: " << beta2_pow.place(); + // Diff from here + + if (paddle::platform::is_gpu_place(dev_ctx.GetPlace()) && + beta1_pow.place() == phi::CPUPlace() && + beta2_pow.place() == phi::CPUPlace()) { + LambMomentREGUpdateFunctor moment_update_functor( + weight_decay, + beta1, + beta2, + epsilon, + *beta1_pow.template data(), + *beta2_pow.template data(), + mom1.template data(), + dev_ctx.template Alloc(mom1_out), + mom2.template data(), + dev_ctx.template Alloc(mom2_out), + grad.template data(), + static_cast(IsMultiPrecision ? master_param_ptr : param_ptr), + trust_ratio_div_ptr, + skip_update_flag); + for_range(moment_update_functor); + MT* beta1_pow_out_data = dev_ctx.template HostAlloc(beta1_pow_out); + beta1_pow_out_data[0] = beta1 * beta1_pow.template data()[0]; + MT* beta2_pow_out_data = dev_ctx.template HostAlloc(beta2_pow_out); + beta2_pow_out_data[0] = beta2 * beta2_pow.template data()[0]; + } else { + beta1_pow_ptr = beta1_pow.template data(); + beta2_pow_ptr = beta2_pow.template data(); + beta1_pow_out_ptr = dev_ctx.template Alloc(beta1_pow_out); + beta2_pow_out_ptr = dev_ctx.template Alloc(beta2_pow_out); + should_update_beta_pow_later = true; + LambMomentMENUpdateFunctor moment_update_functor( + weight_decay, + beta1, + beta2, + epsilon, + static_cast(beta1_pow_ptr), + static_cast(beta2_pow_ptr), + mom1.template data(), + dev_ctx.template Alloc(mom1_out), + mom2.template data(), + dev_ctx.template Alloc(mom2_out), + grad.template data(), + static_cast(IsMultiPrecision ? master_param_ptr : param_ptr), + trust_ratio_div_ptr, + skip_update_flag); + for_range(moment_update_functor); + } + + // Same from here + // Update parameter + // The code in the following part is exactly the same as that in + // paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h Please modify it + // together + DenseTensor p_norm_t; + p_norm_t.Resize(phi::make_ddim({1})); + auto* p_norm_ptr = dev_ctx.template Alloc(&p_norm_t); + + DenseTensor trust_ratio_div_norm_t; + trust_ratio_div_norm_t.Resize(phi::make_ddim({1})); + auto* trust_ratio_div_norm_ptr = + dev_ctx.template Alloc(&trust_ratio_div_norm_t); + + // TODO(zengjinle): remove the following Eigen operations when + // *skip_update == true. + paddle::memory::Buffer buffer(dev_ctx.GetPlace()); + phi::funcs::SquaredL2Norm( + dev_ctx, + reinterpret_cast(IsMultiPrecision ? 
master_param_ptr + : param_ptr), + p_norm_ptr, + numel, + &buffer); + phi::funcs::SquaredL2Norm( + dev_ctx, trust_ratio_div_ptr, trust_ratio_div_norm_ptr, numel, &buffer); + + if (VLOG_IS_ON(1)) { + const auto& name = "Param"; + auto pn = phi::funcs::ToVector(p_norm_ptr, 1, dev_ctx.GetPlace()); + auto tn = + phi::funcs::ToVector(trust_ratio_div_norm_ptr, 1, dev_ctx.GetPlace()); + auto dtype = paddle::framework::DataTypeToString( + paddle::framework::DataTypeTrait::DataType()); + VLOG(1) << "Param " << dtype << " " << name << " pn = " << pn[0] + << " , tn = " << tn[0]; + } + +#define CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(__should_update_beta_pow) \ + do { \ + LambParamUpateFunctor \ + param_update_functor(lr.template data(), \ + static_cast(param_ptr), \ + static_cast(master_param_ptr), \ + p_norm_ptr, \ + trust_ratio_div_ptr, \ + trust_ratio_div_norm_ptr, \ + static_cast(param_out_ptr), \ + static_cast(master_param_out_ptr), \ + skip_update_flag); \ + if (__should_update_beta_pow) { \ + param_update_functor.SetBetaPows(beta1_pow_ptr, \ + beta2_pow_ptr, \ + beta1_pow_out_ptr, \ + beta2_pow_out_ptr, \ + beta1, \ + beta2); \ + } \ + for_range(param_update_functor); \ + } while (0) + + if (should_update_beta_pow_later) { + CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(true); + } else { + CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(false); + } + +#undef CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h index 8dcd3c2ba8c60..40b62cc83fa73 100644 --- a/paddle/phi/kernels/impl/reduce_grad.h +++ b/paddle/phi/kernels/impl/reduce_grad.h @@ -91,6 +91,9 @@ void ReduceGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + if (dims.size() == 0) { + reduce_all = true; + } if (x.dtype() != out_grad.dtype()) { DenseTensorMeta x_grad_meta( out_grad.dtype(), x_grad->dims(), x_grad->layout()); diff --git a/paddle/phi/kernels/kps/compare_kernel.cu b/paddle/phi/kernels/kps/compare_kernel.cu index 0b0990627f0be..b981d802255a2 100644 --- a/paddle/phi/kernels/kps/compare_kernel.cu +++ b/paddle/phi/kernels/kps/compare_kernel.cu @@ -113,7 +113,8 @@ PD_REGISTER_KERNEL(less_than, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(less_equal, KPS, ALL_LAYOUT, @@ -123,7 +124,8 @@ PD_REGISTER_KERNEL(less_equal, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(greater_than, KPS, ALL_LAYOUT, @@ -133,7 +135,8 @@ PD_REGISTER_KERNEL(greater_than, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(greater_equal, KPS, ALL_LAYOUT, @@ -143,7 +146,8 @@ PD_REGISTER_KERNEL(greater_equal, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(equal, KPS, ALL_LAYOUT, @@ -153,7 +157,8 @@ PD_REGISTER_KERNEL(equal, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(not_equal, KPS, ALL_LAYOUT, @@ -163,7 +168,8 @@ PD_REGISTER_KERNEL(not_equal, int, int64_t, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(equal_all, KPS, diff --git a/paddle/phi/kernels/lamb_kernel.h b/paddle/phi/kernels/lamb_kernel.h new file mode 100644 index 0000000000000..f69948453d9b6 --- /dev/null +++ b/paddle/phi/kernels/lamb_kernel.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LambKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + const paddle::optional& master_param, + const paddle::optional& skip_update, + float weight_decay, + float beta1, + float beta2, + float epsilon, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs); + +} // namespace phi diff --git a/paddle/phi/kernels/margin_cross_entropy_grad_kernel.h b/paddle/phi/kernels/margin_cross_entropy_grad_kernel.h new file mode 100644 index 0000000000000..2d0715149751e --- /dev/null +++ b/paddle/phi/kernels/margin_cross_entropy_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +namespace phi { +template +void MarginCrossEntropyGradKernel(const Context& dev_ctx, + const DenseTensor& logits, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + bool return_softmax, + int ring_id, + int rank, + int nranks, + float margin1, + float margin2, + float margin3, + float scale, + DenseTensor* logits_grad); +} // namespace phi diff --git a/paddle/phi/kernels/margin_cross_entropy_kernel.h b/paddle/phi/kernels/margin_cross_entropy_kernel.h new file mode 100644 index 0000000000000..df58256597695 --- /dev/null +++ b/paddle/phi/kernels/margin_cross_entropy_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
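The margin1/margin2/margin3/scale arguments in MarginCrossEntropyKernel below follow the combined-margin softmax convention (ArcFace/CosFace style): only the target-class logit is modified before the softmax. Reference formula (standard formulation, not quoted from the patch):

    logit_y = s\,(\cos(m_1\,\theta_y + m_2) - m_3), \qquad logit_{j \ne y} = s\,\cos\theta_j

with (m_1, m_2, m_3) = (1.0, 0.5, 0.0) giving ArcFace and (1.0, 0.0, 0.35) giving CosFace.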
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MarginCrossEntropyKernel(const Context& dev_ctx, + const DenseTensor& logits, + const DenseTensor& label, + bool return_softmax, + int ring_id, + int rank, + int nranks, + float margin1, + float margin2, + float margin3, + float scale, + DenseTensor* softmax, + DenseTensor* loss); +} // namespace phi diff --git a/paddle/phi/kernels/onednn/activation_grad_kernel.cc b/paddle/phi/kernels/onednn/activation_grad_kernel.cc new file mode 100644 index 0000000000000..2eff072e647fc --- /dev/null +++ b/paddle/phi/kernels/onednn/activation_grad_kernel.cc @@ -0,0 +1,251 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/activation_grad_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h" + +namespace phi { + +#define DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + functor(dev_ctx, x, dout, 0, 0, dx); \ + } + +#define DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + functor_class functor; \ + functor(dev_ctx, x, dout, attr, 0, dx); \ + } + +#define DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + functor(dev_ctx, out, dout, 0, 0, dx); \ + } + +#define DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + functor_class functor; \ + functor(dev_ctx, out, dout, attr, 0, dx); \ + } + +template +void eltwise_grad(const OneDNNContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float alpha, + float beta, + DenseTensor* dx, + dnnl::algorithm algorithm) { + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + funcs::ActivationMKLDNNHandler handler( + algorithm, alpha, beta, mkldnn_engine, dev_ctx.GetPlace(), &x, &dout); + + auto src_memory_p = handler.AcquireBackwardSrcMemory(&x); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(&dout); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); + auto activation_backward_p = handler.AcquireBackwardPrimitive(); + + auto& astream = 
OneDNNContext::tls().get_stream(); + activation_backward_p->execute(astream, + {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); + + dx->set_mem_desc(diff_src_memory_p->get_desc()); +} + +template +void eltwise_grad_use_out(const OneDNNContext& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + float beta, + DenseTensor* dx, + dnnl::algorithm algorithm) { + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + funcs::ActivationMKLDNNHandler handler( + algorithm, alpha, beta, mkldnn_engine, dev_ctx.GetPlace(), &out, &dout); + + auto dst_memory_p = handler.AcquireBackwardSrcMemory(&out); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(&dout); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); + auto activation_backward_p = handler.AcquireBackwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + activation_backward_p->execute(astream, + {{DNNL_ARG_DST, *dst_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); + + dx->set_mem_desc(diff_src_memory_p->get_desc()); +} + +template +struct MKLDNNActivationGradFunc : public funcs::BaseActivationFunctor { + void operator()(const OneDNNContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float alpha, + float beta, + DenseTensor* dx) const { + eltwise_grad(dev_ctx, x, dout, alpha, beta, dx, algorithm); + } +}; + +template +struct MKLDNNActivationGradUseOutFunc : public funcs::BaseActivationFunctor { + void operator()(const OneDNNContext& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + float beta, + DenseTensor* dx) const { + eltwise_grad_use_out(dev_ctx, out, dout, alpha, beta, dx, algorithm); + } +}; + +template +using ReluMKLDNNGradFunctor = + MKLDNNActivationGradFunc; + +template +using SwishMKLDNNGradFunctor = + MKLDNNActivationGradFunc; + +template +using HardSwishMKLDNNGradFunctor = + MKLDNNActivationGradFunc; + +template +using MishMKLDNNGradFunctor = + MKLDNNActivationGradFunc; + +template +using SigmoidMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_logistic_use_dst_for_bwd>; + +template +using TanhMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_tanh_use_dst_for_bwd>; + +template +using SqrtMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_sqrt_use_dst_for_bwd>; + +template +using EluMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_elu_use_dst_for_bwd>; + +template +using ExpMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_exp_use_dst_for_bwd>; + +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhMKLDNNGradUseOutFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtMKLDNNGradUseOutFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, + SigmoidMKLDNNGradUseOutFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, ExpMKLDNNGradUseOutFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluMKLDNNGradFunctor); + +DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, + ReluMKLDNNGradFunctor, + alpha); +DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, + MishMKLDNNGradFunctor, + threshold); +DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, + SwishMKLDNNGradFunctor, + beta); +template +void HardSwishGradKernel(const Context& 
dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float threshold, + float scale, + float offset, + DenseTensor* dx) { + HardSwishMKLDNNGradFunctor functor; + functor(dev_ctx, x, dout, threshold, 0, dx); +} + +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + EluMKLDNNGradUseOutFunctor functor; + functor(dev_ctx, out, dout, alpha, 0, dx); +} + +} // namespace phi + +PD_REGISTER_KERNEL(relu_grad, + OneDNN, + ALL_LAYOUT, + phi::ReluGradKernel, + float, + phi::dtype::bfloat16) {} + +#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL( \ + name, OneDNN, ALL_LAYOUT, phi::func, float, phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(exp_grad, ExpGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_swish_grad, HardSwishGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(swish_grad, SwishGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) diff --git a/paddle/phi/kernels/onednn/activation_kernel.cc b/paddle/phi/kernels/onednn/activation_kernel.cc new file mode 100644 index 0000000000000..fa0af71d399c3 --- /dev/null +++ b/paddle/phi/kernels/onednn/activation_kernel.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
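The DEFINE_ONEDNN_* macros above stamp out one thin kernel per activation. A sketch of roughly what DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhMKLDNNGradUseOutFunctor) expands to, assuming the macro's template parameters are <typename T, typename Context> (illustrative expansion only):

    template <typename T, typename Context>
    void TanhGradKernel(const Context& dev_ctx,
                        const DenseTensor& out,
                        const DenseTensor& dout,
                        DenseTensor* dx) {
      TanhMKLDNNGradUseOutFunctor<T> functor;  // grad computed from out, not x
      functor(dev_ctx, out, dout, 0, 0, dx);   // alpha = 0, beta = 0
    }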
+ +#include "paddle/phi/kernels/activation_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/onednn/mkldnn_reuse.h" + +namespace phi { + +#define DEFINE_ONEDNN_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + functor_class functor; \ + functor(dev_ctx, x, 0, 0, out); \ + } + +#define DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + functor_class functor; \ + functor(dev_ctx, x, attr, 0, out); \ + } + +template +void EltwiseForward(const OneDNNContext& dev_ctx, + const DenseTensor& x, + float alpha, + float beta, + DenseTensor* out, + dnnl::algorithm algorithm) { + PADDLE_ENFORCE_EQ(paddle::platform::is_cpu_place(dev_ctx.GetPlace()), + true, + phi::errors::PreconditionNotMet( + "Operator DNNL eletwise_forward must use ONEDNNPlace")); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + bool is_inplaced = x.IsSharedBufferWith(*out); + + funcs::ActivationMKLDNNHandler handler( + algorithm, alpha, beta, mkldnn_engine, dev_ctx.GetPlace(), &x); + + auto src_memory_p = handler.AcquireSrcMemory(&x); + std::shared_ptr dst_memory_p = nullptr; + if (is_inplaced) { + dst_memory_p = src_memory_p; + dev_ctx.template Alloc(out); + } else { + dst_memory_p = handler.AcquireDstMemory(out); + } + auto activation_p = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + activation_p->execute( + astream, {{DNNL_ARG_FROM, *src_memory_p}, {DNNL_ARG_TO, *dst_memory_p}}); + astream.wait(); + + out->set_mem_desc(dst_memory_p->get_desc()); +} + +template +struct MKLDNNActivationFunc : public funcs::BaseActivationFunctor { + void operator()(const OneDNNContext& dev_ctx, + const DenseTensor& x, + float alpha, + float beta, + DenseTensor* out) const { + EltwiseForward(dev_ctx, x, alpha, beta, out, algorithm); + } +}; + +template +using ReluMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using SwishMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using HardSwishMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using MishMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using SigmoidMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using TanhMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using SqrtMKLDNNFunctor = + MKLDNNActivationFunc; + +template +using EluMKLDNNFunctor = MKLDNNActivationFunc; + +template +using ExpMKLDNNFunctor = MKLDNNActivationFunc; + +template +using RoundMKLDNNFunctor = + MKLDNNActivationFunc; + +DEFINE_ONEDNN_ACTIVATION_KERNEL(Relu, ReluMKLDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Tanh, TanhMKLDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Exp, ExpMKLDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Sqrt, SqrtMKLDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Sigmoid, SigmoidMKLDNNFunctor) +// round eltwise primitive doesn't support BF16, nor does it support grad +DEFINE_ONEDNN_ACTIVATION_KERNEL(Round, RoundMKLDNNFunctor) + +DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, ReluMKLDNNFunctor, alpha) +DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishMKLDNNFunctor, threshold) +DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Elu, EluMKLDNNFunctor, alpha) 
+DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishMKLDNNFunctor, beta) + +template +void HardSwishKernel(const Context& dev_ctx, + const DenseTensor& x, + float threshold, + float scale, + float offset, + DenseTensor* out) { + HardSwishMKLDNNFunctor functor; + functor(dev_ctx, x, threshold, 0, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(round, OneDNN, ALL_LAYOUT, phi::RoundKernel, float) {} + +#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ + PD_REGISTER_KERNEL( \ + name, OneDNN, ALL_LAYOUT, phi::func, float, phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) +PD_REGISTER_ACTIVATION_KERNEL(exp, ExpKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_swish, HardSwishKernel) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel) +PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(relu, ReluKernel) diff --git a/paddle/phi/kernels/reduce_amax_kernel.cc b/paddle/phi/kernels/reduce_amax_kernel.cc index acec25d83db6a..47b5e97467fe7 100644 --- a/paddle/phi/kernels/reduce_amax_kernel.cc +++ b/paddle/phi/kernels/reduce_amax_kernel.cc @@ -26,6 +26,9 @@ void AMaxKernel(const Context& dev_ctx, bool keep_dim, DenseTensor* out) { bool reduce_all = false; + if (dims.size() == 0) { + reduce_all = true; + } AMaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_amin_kernel.cc b/paddle/phi/kernels/reduce_amin_kernel.cc index 28e6e587f4020..8da4f3afd9f43 100644 --- a/paddle/phi/kernels/reduce_amin_kernel.cc +++ b/paddle/phi/kernels/reduce_amin_kernel.cc @@ -26,6 +26,9 @@ void AMinKernel(const Context& dev_ctx, bool keep_dim, DenseTensor* out) { bool reduce_all = false; + if (dims.size() == 0) { + reduce_all = true; + } AMinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc index 26b8bc196ccd4..7bdf9ba2bbcc6 100644 --- a/paddle/phi/kernels/reduce_max_kernel.cc +++ b/paddle/phi/kernels/reduce_max_kernel.cc @@ -26,6 +26,9 @@ void MaxKernel(const Context& dev_ctx, bool keep_dim, DenseTensor* out) { bool reduce_all = false; + if (dims.size() == 0) { + reduce_all = true; + } MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc index 75d906aa4bd75..69725759e4e82 100644 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -26,6 +26,9 @@ void MinKernel(const Context& dev_ctx, bool keep_dim, DenseTensor* out) { bool reduce_all = false; + if (dims.size() == 0) { + reduce_all = true; + } MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_sum_kernel.cc b/paddle/phi/kernels/reduce_sum_kernel.cc index 0d79fa34bc274..c9622768c45d9 100644 --- a/paddle/phi/kernels/reduce_sum_kernel.cc +++ b/paddle/phi/kernels/reduce_sum_kernel.cc @@ -27,6 +27,9 @@ void SumKernel(const Context& dev_ctx, bool keep_dim, DenseTensor* out) { bool reduce_all = false; + if (dims.size() == 0) { + reduce_all = true; + } SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); } diff --git a/paddle/phi/kernels/selected_rows/cpu/lamb_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/lamb_kernel.cc new file mode 100644 index 
0000000000000..e30c0cf970477 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/cpu/lamb_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/lamb_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h" + +PD_REGISTER_KERNEL( + lamb_sr, CPU, ALL_LAYOUT, phi::sr::LambKernel, float, double) {} diff --git a/paddle/phi/kernels/selected_rows/gpu/lamb_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/lamb_kernel.cu new file mode 100644 index 0000000000000..b76d116f7f63f --- /dev/null +++ b/paddle/phi/kernels/selected_rows/gpu/lamb_kernel.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/lamb_kernel.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h" + +PD_REGISTER_KERNEL(lamb_sr, + GPU, + ALL_LAYOUT, + phi::sr::LambKernel, + phi::dtype::float16, + float, + double) { + kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h new file mode 100644 index 0000000000000..5623d0dbdbe69 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h @@ -0,0 +1,351 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/kernels/funcs/lamb_functors.h" + +namespace phi { +namespace sr { + +template +void ComputeRowImpl(const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& lr, + const DenseTensor& mom1, + const DenseTensor& mom2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + const paddle::optional& master_param_opt, + const paddle::optional& skip_update_opt, + float weight_decay_f, + float beta1_f, + float beta2_f, + float epsilon_f, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* mom1_out, + DenseTensor* mom2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_out); + +template +void LambKernel(const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + const paddle::optional& master_param, + const paddle::optional& skip_update, + float weight_decay, + float beta1, + float beta2, + float epsilon, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs) { + using MT = typename phi::dtype::MPTypeTrait::Type; + if (multi_precision) { + ComputeRowImpl(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + weight_decay, + beta1, + beta2, + epsilon, + multi_precision, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); + } else { + ComputeRowImpl(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + weight_decay, + beta1, + beta2, + epsilon, + multi_precision, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); + } +} + +template +void ComputeRowImpl(const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& lr, + const DenseTensor& mom1, + const DenseTensor& mom2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + const paddle::optional& master_param_opt, + const paddle::optional& skip_update_opt, + float weight_decay_f, + float beta1_f, + float beta2_f, + float epsilon_f, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* mom1_out, + DenseTensor* mom2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_out) { + if (!IsMultiPrecision) { + constexpr auto kIsSameType = std::is_same::value; + PADDLE_ENFORCE_EQ( + kIsSameType, + true, + phi::errors::InvalidArgument( + "When multi_precision=False, T and MT must be the same type.")); + } + + const auto* master_param = + IsMultiPrecision ? master_param_opt.get_ptr() : nullptr; + const auto* skip_update = skip_update_opt.get_ptr(); + const bool* skip_update_flag = skip_update && skip_update->IsInitialized() + ? 
skip_update->data() + : nullptr; + if (skip_update_flag && + paddle::platform::is_cpu_place(skip_update->place()) && + (*skip_update_flag)) { + return; + } + + auto weight_decay = static_cast(weight_decay_f); + auto beta1 = static_cast(beta1_f); + auto beta2 = static_cast(beta2_f); + auto epsilon = static_cast(epsilon_f); + auto numel = param.numel(); + phi::funcs::ForRange for_range(dev_ctx, numel); + DenseTensor trust_ratio_div; + trust_ratio_div.Resize(param.dims()); + /*auto trust_ratio_div = + ctx.AllocateTmpTensor(param.dims(), dev_ctx);*/ + auto* trust_ratio_div_ptr = dev_ctx.template Alloc(&trust_ratio_div); + + const void* param_ptr = param.data(); + const void* master_param_ptr = master_param ? master_param->data() : nullptr; + void* param_out_ptr = dev_ctx.template Alloc(param_out); + void* master_param_out_ptr = + master_param_out ? dev_ctx.template Alloc(master_param_out) : nullptr; + // Update moments + bool should_update_beta_pow_later = false; + const MT *beta1_pow_ptr = nullptr, *beta2_pow_ptr = nullptr; + MT *beta1_pow_out_ptr = nullptr, *beta2_pow_out_ptr = nullptr; + VLOG(10) << "Beta1Pow place: " << beta1_pow.place() + << " , Beta2Pow place: " << beta2_pow.place(); + // Diff from here + PADDLE_ENFORCE_EQ( + IsMultiPrecision, + false, + phi::errors::Unimplemented("SelectedRows gradient is not supported when " + "multi_precision=True.")); + constexpr bool kIsSameType = std::is_same::value; + PADDLE_ENFORCE_EQ( + kIsSameType, + true, + phi::errors::Unimplemented("SelectedRows gradient is not supported when " + "multi_precision=True.")); + if (grad.rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad.rows().begin(), grad.rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= cpu_rows[i]) { + is_strict_sorted = false; + break; + } + } + + phi::SelectedRows tmp_grad_merge; + const phi::SelectedRows* grad_merge_ptr; + if (is_strict_sorted) { + grad_merge_ptr = &grad; + } else { + // merge duplicated rows if any. 
+ // The rows of grad_merge have been sorted inside MergeAdd functor + paddle::operators::math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, grad, &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; + } + + auto& grad_merge = *grad_merge_ptr; + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows(grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace()); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + if (paddle::platform::is_gpu_place(dev_ctx.GetPlace()) && + beta1_pow.place() == phi::CPUPlace() && + beta2_pow.place() == phi::CPUPlace()) { + SparseLambMomentREGUpdateFunctor moment_update_functor( + static_cast(weight_decay), + static_cast(beta1), + static_cast(beta2), + static_cast(epsilon), + *beta1_pow.template data(), + *beta2_pow.template data(), + mom1.template data(), + dev_ctx.template Alloc(mom1_out), + mom2.template data(), + dev_ctx.template Alloc(mom2_out), + grad_data, + param.template data(), + trust_ratio_div.template data(), + rows, + row_numel, + grad_merge.rows().size(), + skip_update_flag); + for_range(moment_update_functor); + T* beta1_pow_out_data = dev_ctx.template HostAlloc(beta1_pow_out); + beta1_pow_out_data[0] = + static_cast(beta1) * beta1_pow.template data()[0]; + T* beta2_pow_out_data = dev_ctx.template HostAlloc(beta2_pow_out); + beta2_pow_out_data[0] = + static_cast(beta2) * beta2_pow.template data()[0]; + } else { + beta1_pow_ptr = beta1_pow.template data(); + beta2_pow_ptr = beta2_pow.template data(); + beta1_pow_out_ptr = dev_ctx.template Alloc(beta1_pow_out); + beta2_pow_out_ptr = dev_ctx.template Alloc(beta2_pow_out); + should_update_beta_pow_later = true; + SparseLambMomentMENUpdateFunctor moment_update_functor( + static_cast(weight_decay), + static_cast(beta1), + static_cast(beta2), + static_cast(epsilon), + reinterpret_cast(beta1_pow_ptr), + reinterpret_cast(beta2_pow_ptr), + mom1.template data(), + dev_ctx.template Alloc(mom1_out), + mom2.template data(), + dev_ctx.template Alloc(mom2_out), + grad_data, + param.template data(), + trust_ratio_div.template data(), + rows, + row_numel, + grad_merge.rows().size(), + skip_update_flag); + for_range(moment_update_functor); + } + // Same from here + // Update parameter + // The code in the following part is exactly the same as that in + // paddle/phi/kernels/impl/lamb_kernel_impl.h Please modify it together + DenseTensor p_norm_t; + p_norm_t.Resize(phi::make_ddim({1})); + auto* p_norm_ptr = dev_ctx.template Alloc(&p_norm_t); + + DenseTensor trust_ratio_div_norm_t; + trust_ratio_div_norm_t.Resize(phi::make_ddim({1})); + auto* trust_ratio_div_norm_ptr = + dev_ctx.template Alloc(&trust_ratio_div_norm_t); + + // TODO(zengjinle): remove the following Eigen operations when + // *skip_update == true. + paddle::memory::Buffer buffer(dev_ctx.GetPlace()); + phi::funcs::SquaredL2Norm( + dev_ctx, + reinterpret_cast(IsMultiPrecision ? 
master_param_ptr + : param_ptr), + p_norm_ptr, + numel, + &buffer); + phi::funcs::SquaredL2Norm( + dev_ctx, trust_ratio_div_ptr, trust_ratio_div_norm_ptr, numel, &buffer); + + if (VLOG_IS_ON(1)) { + const auto& name = "Param"; + auto pn = phi::funcs::ToVector(p_norm_ptr, 1, dev_ctx.GetPlace()); + auto tn = + phi::funcs::ToVector(trust_ratio_div_norm_ptr, 1, dev_ctx.GetPlace()); + auto dtype = paddle::framework::DataTypeToString( + paddle::framework::DataTypeTrait::DataType()); + VLOG(1) << "Param " << dtype << " " << name << " pn = " << pn[0] + << " , tn = " << tn[0]; + } + +#define CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(__should_update_beta_pow) \ + do { \ + LambParamUpateFunctor \ + param_update_functor(lr.template data(), \ + static_cast(param_ptr), \ + static_cast(master_param_ptr), \ + p_norm_ptr, \ + trust_ratio_div_ptr, \ + trust_ratio_div_norm_ptr, \ + static_cast(param_out_ptr), \ + static_cast(master_param_out_ptr), \ + skip_update_flag); \ + if (__should_update_beta_pow) { \ + param_update_functor.SetBetaPows(beta1_pow_ptr, \ + beta2_pow_ptr, \ + beta1_pow_out_ptr, \ + beta2_pow_out_ptr, \ + beta1, \ + beta2); \ + } \ + for_range(param_update_functor); \ + } while (0) + + if (should_update_beta_pow_later) { + CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(true); + } else { + CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(false); + } + +#undef CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC +} + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/lamb_kernel.h b/paddle/phi/kernels/selected_rows/lamb_kernel.h new file mode 100644 index 0000000000000..306f1ca0ff79b --- /dev/null +++ b/paddle/phi/kernels/selected_rows/lamb_kernel.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void LambKernel(const Context& dev_ctx, + const DenseTensor& param, + const SelectedRows& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow, + const DenseTensor& beta2_pow, + const paddle::optional& master_param, + const paddle::optional& skip_update, + float weight_decay, + float beta1, + float beta2, + float epsilon, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_outs); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/shape_kernel.cc b/paddle/phi/kernels/shape_kernel.cc index ea48ea6171e6c..f35b7e544476e 100644 --- a/paddle/phi/kernels/shape_kernel.cc +++ b/paddle/phi/kernels/shape_kernel.cc @@ -48,7 +48,9 @@ PD_REGISTER_KERNEL(shape, float, double, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(shape, diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index a8e88f351ccbc..c7e7849083a3e 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -175,7 +175,7 @@ template void CoalesceKernel(const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "CoalesceGPUKernel", ([&] { CoalesceGPUKernel(dev_ctx, x, out); })); diff --git a/paddle/phi/kernels/sparse/gpu/conv.cu.h b/paddle/phi/kernels/sparse/gpu/conv.cu.h index 859857ed7baac..d68145e958574 100644 --- a/paddle/phi/kernels/sparse/gpu/conv.cu.h +++ b/paddle/phi/kernels/sparse/gpu/conv.cu.h @@ -65,6 +65,7 @@ __global__ void GatherKernelV2(const T* inputs, const int* index_groups, const int non_zero_num, const int kernel_size, + const int max_voxel, const int channels, const int buffer_count, T* output) { @@ -82,10 +83,11 @@ __global__ void GatherKernelV2(const T* inputs, #pragma unroll for (int it = 0; it < buffer_count; it++) { int len = index_counts[indices_i + it * non_zero_num]; - const int group_offset = it * kernel_size * non_zero_num; + const int group_offset = it * kernel_size * max_voxel * non_zero_num; #pragma unroll for (int j = 0; j < len; j++) { - int out_i = index_groups[indices_i * kernel_size + j + group_offset]; + int out_i = index_groups[indices_i * kernel_size * max_voxel + j + + group_offset]; phi::Store( in_vec, output + out_i * channels + channels_i * VecSize); } @@ -127,6 +129,7 @@ inline void GatherV2(const GPUContext& dev_ctx, const int* index_groups, const int non_zero_num, const int kernel_size, + const int max_voxel, const int channels, const int buffer_count, T* output) { @@ -142,6 +145,7 @@ inline void GatherV2(const GPUContext& dev_ctx, index_groups, non_zero_num, kernel_size, + max_voxel, channels, buffer_count, output); @@ -156,6 +160,7 @@ inline void GatherV2(const GPUContext& dev_ctx, index_groups, non_zero_num, kernel_size, + max_voxel, channels, buffer_count, output); @@ -202,7 +207,7 @@ __global__ void UniqueKernel(const IntT* in_indexs, template __global__ void GroupIndexs(const int* out_index_table, const int n, - 
const int kernel_size, + const int offset, IntT* out_indexs, int* out_index_counts, int* out_index_groups) { @@ -214,7 +219,7 @@ __global__ void GroupIndexs(const int* out_index_table, // kernel_size at most int j = atomicAdd(out_index_counts + real_index, 1); // nnz * kernel_size - out_index_groups[real_index * kernel_size + j] = i; + out_index_groups[real_index * offset + j] = i; } } @@ -298,18 +303,36 @@ __global__ void ProductRuleBookKernel(const T* x_indices, } } -template +template __global__ void GetOutIndexTable(const IntT* indices, const IntT non_zero_num, const Dims4D dims, - int* out_index_table) { + int* out_index_table, + int* out_index_table2, + int* max_voxel) { + __shared__ int cache_max; + if (threadIdx.x == 0) { + cache_max = 0; + } + __syncthreads(); + CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT batch = indices[i]; IntT in_z = indices[i + non_zero_num]; IntT in_y = indices[i + 2 * non_zero_num]; IntT in_x = indices[i + 3 * non_zero_num]; IntT index = PointToIndex(batch, in_x, in_y, in_z, dims); - out_index_table[index] = i == 0 ? -1 : i; + if (save_out_index) { + out_index_table[index] = i == 0 ? -1 : i; + } + + int count = atomicAdd(out_index_table2 + index, 1); + atomicMax(&cache_max, count); + } + + __syncthreads(); + if (threadIdx.x == 0) { + atomicMax(max_voxel, cache_max + 1); } } @@ -318,10 +341,22 @@ __global__ void GetOutIndexTable(int* indexs, const int non_zero_num, const Dims4D out_dims, int* out_index_table, + int* out_index_table2, + int* max_voxel, IntT* out_indices) { + __shared__ int cache_max; + if (threadIdx.x == 0) { + cache_max = 0; + } + __syncthreads(); + CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT index = static_cast(indexs[i]); out_index_table[index] = i; + + int count = atomicAdd(out_index_table2 + index, 1); + atomicMax(&cache_max, count); + IntT batch, x, y, z; phi::funcs::sparse::IndexToPoint( index, out_dims, &batch, &x, &y, &z); @@ -332,6 +367,11 @@ __global__ void GetOutIndexTable(int* indexs, out_indices[i + non_zero_num * 3] = x; indexs[i] = 0; } + + __syncthreads(); + if (threadIdx.x == 0) { + atomicMax(max_voxel, cache_max + 1); + } } template @@ -451,7 +491,7 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, template __global__ void GroupIndexs(const int n, - const int kernel_size, + const int offset, const IntT* indexs, int* index_counts, int* index_groups) { @@ -460,7 +500,7 @@ __global__ void GroupIndexs(const int n, // kernel_size at most int j = atomicAdd(index_counts + index, 1); // nnz * kernel_size - index_groups[index * kernel_size + j] = i; + index_groups[index * offset + j] = i; } } @@ -468,7 +508,7 @@ __global__ void GroupIndexs(const int n, template __global__ void GroupIndexsV2(const int rulebook_len, const int non_zero_num, - const int kernel_size, + const int offset, const int half_kernel_offset, const IntT* indexs, int* index_counts, @@ -479,11 +519,11 @@ __global__ void GroupIndexsV2(const int rulebook_len, i < half_kernel_offset ? index_counts : index_counts + non_zero_num; int* groups_ptr = i < half_kernel_offset ? 
index_groups - : index_groups + non_zero_num * kernel_size; + : index_groups + non_zero_num * offset; // conflict kernel_size times at most int j = atomicAdd(counts_ptr + index, 1); // nnz * kernel_size - groups_ptr[index * kernel_size + j] = i; + groups_ptr[index * offset + j] = i; } } @@ -582,6 +622,10 @@ int ProductRuleBook(const Context& dev_ctx, DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); int* out_index_table_ptr = out_index_table.data(); + DenseTensor out_index_table2 = phi::Empty(dev_ctx, {table_size + 1}); + int* out_index_table2_ptr = out_index_table2.data(); + int* h_max_voxel = h_counter + kernel_size; + if (subm) { DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); IntT* rulebook_ptr = tmp_rulebook.data(); @@ -594,14 +638,29 @@ int ProductRuleBook(const Context& dev_ctx, phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); + phi::backends::gpu::GpuMemsetAsync(out_index_table2_ptr, + 0, + sizeof(int) * (table_size + 1), + dev_ctx.stream()); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - GetOutIndexTable<<>>( - out_indices.data(), non_zero_num, d_x_dims, out_index_table_ptr); + GetOutIndexTable + <<>>(out_indices.data(), + non_zero_num, + d_x_dims, + out_index_table_ptr, + out_index_table2_ptr, + out_index_table2_ptr + table_size); + phi::backends::gpu::GpuMemcpyAsync(h_max_voxel, + out_index_table2_ptr + table_size, + sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + dev_ctx.Wait(); size_t cache_size = kernel_size * 2 + kernel_size * config.thread_per_block.x * 2 * @@ -655,6 +714,22 @@ int ProductRuleBook(const Context& dev_ctx, out_rulebook_ptr); *rulebook = out_rulebook; + unique_value->ResizeAndAllocate( + {static_cast(non_zero_num * h_max_voxel[0] * kernel_size)}); + int* unique_value_ptr = unique_value->data(); + out_index->ResizeAndAllocate({static_cast(rulebook_len)}); + int* out_index_ptr = out_index->data(); + phi::backends::gpu::GpuMemsetAsync( + out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); + GroupIndexs<<>>(rulebook_len, + kernel_size * h_max_voxel[0], + out_rulebook_ptr + rulebook_len, + out_index_ptr, + unique_value_ptr); + return rulebook_len; } else { @@ -729,17 +804,35 @@ int ProductRuleBook(const Context& dev_ctx, IntT* out_indices_ptr = out_indices.data(); + phi::backends::gpu::GpuMemsetAsync( + out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); + phi::backends::gpu::GpuMemsetAsync(out_index_table2_ptr, + 0, + sizeof(int) * (table_size + 1), + dev_ctx.stream()); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); - GetOutIndexTable<<>>(out_index_ptr, - out_nnz, - d_out_dims, - out_index_table_ptr, - out_indices_ptr); + GetOutIndexTable + <<>>(out_index_ptr, + out_nnz, + d_out_dims, + out_index_table_ptr, + out_index_table2_ptr, + out_index_table2_ptr + table_size, + out_indices_ptr); + phi::backends::gpu::GpuMemcpyAsync(h_max_voxel, + out_index_table2_ptr + table_size, + sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + dev_ctx.Wait(); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - unique_value->ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); + unique_value->ResizeAndAllocate( + {static_cast(out_nnz * h_max_voxel[0] * kernel_size)}); int* unique_value_ptr = unique_value->data(); GroupIndexs<<>>(out_index_table_ptr, rulebook_len, - kernel_size, + kernel_size * h_max_voxel[0], rulebook_ptr + rulebook_len, 
out_index_ptr, unique_value_ptr); diff --git a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu index 848517aae2549..9cbd75ed4ea99 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu @@ -124,10 +124,44 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, } } + int max_voxel = counter_ptr[kernel_size]; + if (!subm) { + const auto& x_dims = x.dims(); + Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); + int64_t table_size = 1; + for (int i = 0; i < x_dims.size() - 1; i++) { + table_size *= x_dims[i]; + } + DenseTensor in_index_table = phi::Empty(dev_ctx, {table_size + 1}); + int* in_index_table_ptr = in_index_table.data(); + phi::backends::gpu::GpuMemsetAsync(in_index_table_ptr, + 0, + sizeof(int) * (table_size + 1), + dev_ctx.stream()); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x.nnz(), 1); + GetOutIndexTable + <<>>(x.non_zero_indices().data(), + x.nnz(), + d_x_dims, + nullptr, + in_index_table_ptr, + in_index_table_ptr + table_size); + + phi::backends::gpu::GpuMemcpyAsync(&max_voxel, + in_index_table_ptr + table_size, + sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + dev_ctx.Wait(); + } + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); DenseTensor unique_value = phi::Empty( - dev_ctx, {static_cast(x_grad->nnz() * kernel_size * 2)}); + dev_ctx, {static_cast(x_grad->nnz() * max_voxel * kernel_size * 2)}); DenseTensor out_index = phi::Empty(dev_ctx, {static_cast(x.nnz() * 2)}); int* out_index_ptr = out_index.data(); @@ -140,7 +174,7 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, 0, dev_ctx.stream()>>>(rulebook_len, x.nnz(), - kernel_size, + kernel_size * max_voxel, offsets[kernel_size / 2], rulebook_ptr, out_index_ptr, @@ -152,6 +186,7 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, unique_value_ptr, x.nnz(), kernel_size, + max_voxel, in_channels, 2, in_features_ptr); @@ -212,6 +247,7 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, unique_value.data(), x_grad->nnz(), kernel_size, + max_voxel, in_channels, 2, x_grad_values_ptr); @@ -233,7 +269,7 @@ void Conv3dCooGradKernel(const Context& dev_ctx, const std::string& key, SparseCooTensor* x_grad, DenseTensor* kernel_grad) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dCooGradGPUKernel", ([&] { Conv3dCooGradGPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu index 543f3884edcb4..1a2b3134657e4 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -66,7 +66,7 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; DenseTensor h_counter, h_offsets; - h_counter.Resize({kernel_size}); + h_counter.Resize({kernel_size + 1}); h_offsets.Resize({kernel_size + 1}); int* h_counter_ptr = dev_ctx.template HostAlloc(&h_counter); int* h_offsets_ptr = dev_ctx.template HostAlloc(&h_offsets); @@ -74,7 +74,7 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. 
product rulebook - DenseTensor counter_per_kernel = phi::Empty(dev_ctx, {kernel_size}); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, {kernel_size + 1}); DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, {kernel_size}); DenseTensor out_index = phi::Empty(dev_ctx, {1}); DenseTensor unique_value = phi::Empty(dev_ctx, {1}); @@ -143,26 +143,6 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, T* out_values_ptr = out_values->data(); set_zero(dev_ctx, out_values, static_cast(0.0f)); - if (subm) { - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - unique_value.ResizeAndAllocate( - {static_cast(out->nnz() * kernel_size)}); - out_index.ResizeAndAllocate({static_cast(rulebook_len)}); - int* out_index_ptr = out_index.data(); - int* unique_value_ptr = unique_value.data(); - phi::backends::gpu::GpuMemsetAsync( - out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); - GroupIndexs<<>>(rulebook_len, - kernel_size, - rulebook_ptr + rulebook_len, - out_index_ptr, - unique_value_ptr); - } - const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { if (h_counter_ptr[i] <= 0) { @@ -196,6 +176,7 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, unique_value.data(), out->nnz(), kernel_size, + h_counter_ptr[kernel_size], out_channels, 1, out_values_ptr); @@ -221,7 +202,7 @@ void Conv3dCooKernel(const Context& dev_ctx, SparseCooTensor* out, DenseTensor* rulebook, DenseTensor* counter) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] { Conv3dCooGPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu index 35d63b7630930..5b182637883db 100644 --- a/paddle/phi/kernels/sparse/gpu/mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu @@ -111,7 +111,7 @@ void SparseMaskKernel(const Context& dev_ctx, const DenseTensor& x, const SparseCooTensor& mask, SparseCooTensor* out) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( mask.non_zero_indices().dtype(), "SparseMaskGPUKernel", ([&] { SparseMaskGPUKernel(dev_ctx, x, mask, out); })); @@ -270,7 +270,7 @@ void SparseMaskHelperKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& mask_indices, DenseTensor* out) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] { SparseMaskHelperGPUKernel(dev_ctx, x, mask_indices, out); })); diff --git a/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu index 26e37556d34ce..2344325d9515f 100644 --- a/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mv_grad_kernel.cu @@ -70,7 +70,7 @@ void MvCooGradKernel(const Context &dev_ctx, // InferMeta of SparseCooTensor 'dx', CreateLikeInferMeta EmptyLikeCooKernel(dev_ctx, x, dx); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, dx->nnz()); - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( dx->non_zero_indices().dtype(), "MvCooGradKernel", ([&] { MvCooGradGpuKernel <<dims()[1]; auto config = phi::backends::gpu::GetGpuLaunchConfig2D( dev_ctx, col_number, row_number); - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( dx->non_zero_crows().dtype(), "MvCsrGradKernel", ([&] { MvCsrGradGpuKernel <<& kernel_sizes, SparseCooTensor* x_grad) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolCooGradGPUKernel", ([&] { 
MaxPoolCooGradGPUKernel( dev_ctx, x, rulebook, counter, out, out_grad, kernel_sizes, x_grad); diff --git a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu index 8b1888e0a64cf..2480a905c2177 100644 --- a/paddle/phi/kernels/sparse/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/pool_kernel.cu @@ -139,7 +139,7 @@ void MaxPoolCooKernel(const Context& dev_ctx, SparseCooTensor* out, DenseTensor* rulebook, DenseTensor* counter) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolCooGPUKernel", ([&] { MaxPoolCooGPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu index 14b9ec9a37619..33165e29359c4 100644 --- a/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/softmax_grad_kernel.cu @@ -92,7 +92,7 @@ void SoftmaxCsrGradKernel(const Context& dev_ctx, dim3 grid((total_row_number + 3) / 4); dim3 block(32, 4); - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( out.non_zero_crows().dtype(), "SoftmaxCsrGradKernel", ([&] { SoftmaxGradGpuKernel<<>>( out.non_zero_crows().data(), diff --git a/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu b/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu index ee0671b333f81..05f200f9b02c0 100644 --- a/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu @@ -105,15 +105,15 @@ void SoftmaxCsrKernel(const Context& dev_ctx, dim3 grid((total_row_number + 3) / 4); dim3 block(32, 4); - PD_VISIT_INTEGRAL_TYPES(x.non_zero_crows().dtype(), "CsrSoftmaxKernel", ([&] { - SoftmaxGpuKernel - <<>>( - x.non_zero_crows().data(), - x.non_zero_elements().data(), - out->mutable_non_zero_elements()->data(), - row_number, - total_row_number); - })); + PD_VISIT_BASE_INTEGRAL_TYPES( + x.non_zero_crows().dtype(), "CsrSoftmaxKernel", ([&] { + SoftmaxGpuKernel<<>>( + x.non_zero_crows().data(), + x.non_zero_elements().data(), + out->mutable_non_zero_elements()->data(), + row_number, + total_row_number); + })); } } // namespace sparse diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index f569072e319d5..97221c94892b5 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -277,7 +277,7 @@ template void SparseCsrToCooKernel(const Context& dev_ctx, const SparseCsrTensor& x, SparseCooTensor* out) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_crows().dtype(), "SparseCsrToCooGPUKernel", ([&] { SparseCsrToCooGPUKernel(dev_ctx, x, out); })); @@ -421,7 +421,7 @@ template void SparseCooToCsrKernel(const Context& dev_ctx, const SparseCooTensor& x, SparseCsrTensor* out) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "SparseCooToCsrGPUKernel", ([&] { SparseCooToCsrGPUKernel(dev_ctx, x, out); })); @@ -510,7 +510,7 @@ template void SparseCooToDenseKernel(const Context& dev_ctx, const SparseCooTensor& x, DenseTensor* out) { - PD_VISIT_INTEGRAL_TYPES( + PD_VISIT_BASE_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "SparseCooToDenseGPUKernel", ([&] { SparseCooToDenseGPUKernel(dev_ctx, x, out); })); diff --git a/paddle/phi/kernels/unpool_grad_kernel.h b/paddle/phi/kernels/unpool_grad_kernel.h new file mode 100644 index 0000000000000..a270d700a1c5b --- /dev/null +++ b/paddle/phi/kernels/unpool_grad_kernel.h @@ -0,0 +1,47 @@ +// 
Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template <typename T, typename Context> +void UnpoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector<int>& ksize, + const std::vector<int>& strides, + const std::vector<int>& paddings, + const std::vector<int>& output_size, + const std::string& data_format, + DenseTensor* x_grad); + +template <typename T, typename Context> +void Unpool3dGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector<int>& ksize, + const std::vector<int>& strides, + const std::vector<int>& paddings, + const std::vector<int>& output_size, + const std::string& data_format, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/unpool_kernel.h b/paddle/phi/kernels/unpool_kernel.h new file mode 100644 index 0000000000000..fb537f27667bd --- /dev/null +++ b/paddle/phi/kernels/unpool_kernel.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template <typename T, typename Context> +void UnpoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const std::vector<int>& ksize, + const std::vector<int>& strides, + const std::vector<int>& paddings, + const std::vector<int>& output_size, + const std::string& data_format, + DenseTensor* out); + +template <typename T, typename Context> +void Unpool3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const std::vector<int>& ksize, + const std::vector<int>& strides, + const std::vector<int>& paddings, + const std::vector<int>& output_size, + const std::string& data_format, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/class_center_sample_sig.cc b/paddle/phi/ops/compat/class_center_sample_sig.cc new file mode 100644 index 0000000000000..cfaf2b86436db --- /dev/null +++ b/paddle/phi/ops/compat/class_center_sample_sig.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ClassCenterSampleOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("class_center_sample", + {"Label"}, + {"num_classes", + "num_samples", + "ring_id", + "rank", + "nranks", + "fix_seed", + "seed"}, + {"RemappedLabel", "SampledLocalClassCenter"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(class_center_sample, + phi::ClassCenterSampleOpArgumentMapping); diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc index e5aa570985596..1030946980f86 100644 --- a/paddle/phi/ops/compat/einsum_sig.cc +++ b/paddle/phi/ops/compat/einsum_sig.cc @@ -17,14 +17,10 @@ limitations under the License. */ namespace phi { KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.OutputSize("XShape") > 0 && ctx.OutputSize("InnerCache") > 0) { - return KernelSignature("einsum_raw", - {"Operands"}, - {"equation"}, - {"Out", "InnerCache", "XShape"}); - } else { - return KernelSignature("einsum", {"Operands"}, {"equation"}, {"Out"}); - } + return KernelSignature("einsum_raw", + {"Operands"}, + {"equation"}, + {"Out", "InnerCache", "XShape"}); } KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { diff --git a/paddle/phi/ops/compat/fill_diagonal_tensor_sig.cc b/paddle/phi/ops/compat/fill_diagonal_tensor_sig.cc new file mode 100644 index 0000000000000..56b3c2ab81a9b --- /dev/null +++ b/paddle/phi/ops/compat/fill_diagonal_tensor_sig.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
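For readers unfamiliar with the ops/compat layer touched throughout this patch: each *_sig.cc file maps a legacy fluid operator's named inputs, attributes and outputs onto the argument order expected by the corresponding phi kernel, exactly as class_center_sample_sig.cc does above. A minimal sketch for a hypothetical "my_scale" operator follows; the op and all of its names are invented here, and only the registration pattern mirrors the files in this patch:

    #include "paddle/phi/core/compat/op_utils.h"

    namespace phi {

    // The three lists are forwarded positionally to the "my_scale" phi
    // kernel's inputs, attributes and outputs.
    KernelSignature MyScaleOpArgumentMapping(const ArgumentMappingContext& ctx) {
      return KernelSignature("my_scale", {"X"}, {"scale", "bias"}, {"Out"});
    }

    }  // namespace phi

    PD_REGISTER_ARG_MAPPING_FN(my_scale, phi::MyScaleOpArgumentMapping);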
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature FillDiagonalTensorOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "fill_diagonal_tensor", {"X", "Y"}, {"offset", "dim1", "dim2"}, {"Out"}); +} + +KernelSignature FillDiagonalTensorGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("fill_diagonal_tensor_grad", + {"Out@GRAD"}, + {"offset", "dim1", "dim2"}, + {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(fill_diagonal_tensor, + phi::FillDiagonalTensorOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fill_diagonal_tensor_grad, + phi::FillDiagonalTensorGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/fill_sig.cc b/paddle/phi/ops/compat/fill_sig.cc new file mode 100644 index 0000000000000..2af8fcbea49ca --- /dev/null +++ b/paddle/phi/ops/compat/fill_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { +KernelSignature FillOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("fill", {"X"}, {"value_float"}, {"Out"}); +} + +KernelSignature FillGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "fill_grad", {"Out@GRAD"}, {"value_float"}, {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(fill_any, fill); +PD_REGISTER_BASE_KERNEL_NAME(fill_any_grad, fill_grad); + +PD_REGISTER_ARG_MAPPING_FN(fill_any, phi::FillOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fill_any_grad, phi::FillGradOpArgumentMapping); diff --git a/paddle/fluid/operators/optimizers/lamb_op.cu b/paddle/phi/ops/compat/fold_sig.cc similarity index 53% rename from paddle/fluid/operators/optimizers/lamb_op.cu rename to paddle/phi/ops/compat/fold_sig.cc index 0d60979eef0bd..ed8ac084ba0c2 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cu +++ b/paddle/phi/ops/compat/fold_sig.cc @@ -1,22 +1,26 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/optimizers/lamb_op.h" +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature FoldGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "fold_grad", + {"X", "Y@GRAD"}, + {"output_sizes", "kernel_sizes", "strides", "paddings", "dilations"}, + {"X@GRAD"}); +} + +} // namespace phi -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lamb, - ops::LambOpKernel, - ops::LambOpKernel, - ops::LambOpKernel); +PD_REGISTER_ARG_MAPPING_FN(fold_grad, phi::FoldGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc index 9df2cf4d0fe91..c8c15619d5d39 100644 --- a/paddle/phi/ops/compat/graph_send_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_recv_sig.cc @@ -18,10 +18,17 @@ namespace phi { KernelSignature GraphSendRecvOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("graph_send_recv", - {"X", "Src_index", "Dst_index"}, - {"pool_type", "out_size"}, - {"Out", "Dst_count"}); + if (ctx.HasInput("Out_size")) { + return KernelSignature("graph_send_recv", + {"X", "Src_index", "Dst_index"}, + {"pool_type", "Out_size"}, + {"Out", "Dst_count"}); + } else { + return KernelSignature("graph_send_recv", + {"X", "Src_index", "Dst_index"}, + {"pool_type", "out_size"}, + {"Out", "Dst_count"}); + } } KernelSignature GraphSendRecvGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/lamb_sig.cc b/paddle/phi/ops/compat/lamb_sig.cc new file mode 100644 index 0000000000000..a59ae6155c183 --- /dev/null +++ b/paddle/phi/ops/compat/lamb_sig.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
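The graph_send_recv mapping above now prefers a runtime Out_size tensor over the static out_size attribute whenever the former is present. The same selection idiom, condensed for a hypothetical "my_resize" operator (all names invented; only the HasInput-based branching mirrors the change above):

    #include "paddle/phi/core/compat/op_utils.h"

    namespace phi {

    KernelSignature MyResizeOpArgumentMapping(const ArgumentMappingContext& ctx) {
      // Prefer the tensor input carrying the size at runtime; fall back to
      // the compile-time attribute with the same meaning otherwise.
      if (ctx.HasInput("OutSize")) {
        return KernelSignature("my_resize", {"X"}, {"OutSize"}, {"Out"});
      }
      return KernelSignature("my_resize", {"X"}, {"out_size"}, {"Out"});
    }

    }  // namespace phi

    PD_REGISTER_ARG_MAPPING_FN(my_resize, phi::MyResizeOpArgumentMapping);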
+#include + +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +KernelSignature LambOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector in_names = {"Param", + "Grad", + "LearningRate", + "Moment1", + "Moment2", + "Beta1Pow", + "Beta2Pow", + "MasterParam", + "SkipUpdate"}; + paddle::small_vector out_names = {"ParamOut", + "Moment1Out", + "Moment2Out", + "Beta1PowOut", + "Beta2PowOut", + "MasterParamOut"}; + paddle::small_vector attr_names; + + attr_names.emplace_back("weight_decay"); + attr_names.emplace_back("beta1"); + attr_names.emplace_back("beta2"); + attr_names.emplace_back("epsilon"); + attr_names.emplace_back("multi_precision"); + + if (ctx.IsSelectedRowsInput("Grad")) { + return KernelSignature("lamb_sr", + std::move(in_names), + std::move(attr_names), + std::move(out_names)); + } else if (ctx.IsDenseTensorInput("Grad")) { + return KernelSignature("lamb", + std::move(in_names), + std::move(attr_names), + std::move(out_names)); + } else { + return KernelSignature("unregistered", {}, {}, {}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(lamb, phi::LambOpArgumentMapping); diff --git a/paddle/phi/ops/compat/margin_cross_entropy_sig.cc b/paddle/phi/ops/compat/margin_cross_entropy_sig.cc new file mode 100644 index 0000000000000..adc0e426d1952 --- /dev/null +++ b/paddle/phi/ops/compat/margin_cross_entropy_sig.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
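The lamb mapping above routes SelectedRows gradients to the lamb_sr kernel added earlier in this patch and dense gradients to the dense lamb kernel; in both cases the in/attr/out name lists must line up positionally with the LambKernel signature declared in lamb_kernel.h. A condensed sketch of the same dispatch idiom for a hypothetical "my_opt" optimizer (op and kernel names invented):

    #include "paddle/phi/core/compat/op_utils.h"

    namespace phi {

    KernelSignature MyOptOpArgumentMapping(const ArgumentMappingContext& ctx) {
      // SelectedRows gradients go to the *_sr kernel, dense gradients to the
      // regular kernel; anything else signals an unregistered combination.
      if (ctx.IsSelectedRowsInput("Grad")) {
        return KernelSignature(
            "my_opt_sr", {"Param", "Grad", "LearningRate"}, {"epsilon"}, {"ParamOut"});
      } else if (ctx.IsDenseTensorInput("Grad")) {
        return KernelSignature(
            "my_opt", {"Param", "Grad", "LearningRate"}, {"epsilon"}, {"ParamOut"});
      }
      return KernelSignature("unregistered", {}, {}, {});
    }

    }  // namespace phi

    PD_REGISTER_ARG_MAPPING_FN(my_opt, phi::MyOptOpArgumentMapping);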
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MarginCrossEntropyOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("margin_cross_entropy", + {"Logits", "Label"}, + {"return_softmax", + "ring_id", + "rank", + "nranks", + "margin1", + "margin2", + "margin3", + "scale"}, + {"Softmax", "Loss"}); +} + +KernelSignature MarginCrossEntropyGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("margin_cross_entropy_grad", + {"Logits", "Label", "Softmax", "Loss@GRAD"}, + {"return_softmax", + "ring_id", + "rank", + "nranks", + "margin1", + "margin2", + "margin3", + "scale"}, + {"Logits@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(margin_cross_entropy, + phi::MarginCrossEntropyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(margin_cross_entropy_grad, + phi::MarginCrossEntropyGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/squeeze_sig.cc b/paddle/phi/ops/compat/squeeze_sig.cc index a251b9f537ccf..4ca45903acfa0 100644 --- a/paddle/phi/ops/compat/squeeze_sig.cc +++ b/paddle/phi/ops/compat/squeeze_sig.cc @@ -18,12 +18,8 @@ namespace phi { KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.HasOutput("XShape")) { - return KernelSignature( - "squeeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); - } else { - return KernelSignature("squeeze", {"X"}, {"axes"}, {"Out"}); - } + return KernelSignature( + "squeeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); } KernelSignature SqueezeGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/unpool3d_sig.cc b/paddle/phi/ops/compat/unpool3d_sig.cc new file mode 100644 index 0000000000000..c73aca837d57b --- /dev/null +++ b/paddle/phi/ops/compat/unpool3d_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" +namespace phi { + +KernelSignature Unpool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "unpool3d", + {"X", "Indices"}, + {"ksize", "strides", "paddings", "output_size", "data_format"}, + {"Out"}); +} + +KernelSignature Unpool3dGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "unpool3d_grad", + {"X", "Indices", "Out", "Out@GRAD"}, + {"ksize", "strides", "paddings", "output_size", "data_format"}, + {"X@GRAD"}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(unpool3d, phi::Unpool3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unpool3d_grad, phi::Unpool3dGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/unpool_sig.cc b/paddle/phi/ops/compat/unpool_sig.cc new file mode 100644 index 0000000000000..fb751b4b4b652 --- /dev/null +++ b/paddle/phi/ops/compat/unpool_sig.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" +namespace phi { + +KernelSignature UnpoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "unpool", + {"X", "Indices"}, + {"ksize", "strides", "paddings", "output_size", "data_format"}, + {"Out"}); +} + +KernelSignature UnpoolGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "unpool_grad", + {"X", "Indices", "Out", "Out@GRAD"}, + {"ksize", "strides", "paddings", "output_size", "data_format"}, + {"X@GRAD"}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(unpool, phi::UnpoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(unpool_grad, phi::UnpoolGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/unsqueeze_sig.cc b/paddle/phi/ops/compat/unsqueeze_sig.cc index a2f184e7150b8..568097298b7ac 100644 --- a/paddle/phi/ops/compat/unsqueeze_sig.cc +++ b/paddle/phi/ops/compat/unsqueeze_sig.cc @@ -18,33 +18,18 @@ namespace phi { KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.HasOutput("XShape")) { - if (ctx.InputSize("AxesTensorList") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensorList"; - return KernelSignature("unsqueeze_with_xshape", - {"X"}, - {"AxesTensorList"}, - {"Out", "XShape"}); - } else if (ctx.InputSize("AxesTensor") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensor"; - return KernelSignature( - "unsqueeze_with_xshape", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); - } else { - VLOG(2) << "unsqueeze2 in axes"; - return KernelSignature( - "unsqueeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); - } + if (ctx.InputSize("AxesTensorList") > 0) { + VLOG(2) << "unsqueeze2 in AxesTensorList"; + return KernelSignature( + "unsqueeze_with_xshape", {"X"}, {"AxesTensorList"}, {"Out", "XShape"}); + } else if (ctx.InputSize("AxesTensor") > 0) { + VLOG(2) << "unsqueeze2 in AxesTensor"; + return KernelSignature( + "unsqueeze_with_xshape", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); } else { - if (ctx.InputSize("AxesTensorList") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensorList"; - return KernelSignature("unsqueeze", {"X"}, {"AxesTensorList"}, {"Out"}); - } else if (ctx.InputSize("AxesTensor") > 0) { - VLOG(2) << "unsqueeze2 in AxesTensor"; - return KernelSignature("unsqueeze", {"X"}, {"AxesTensor"}, {"Out"}); - } else { - VLOG(2) << "unsqueeze2 in axes"; - return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"Out"}); - } + VLOG(2) << "unsqueeze2 in axes"; + return KernelSignature( + "unsqueeze_with_xshape", {"X"}, {"axes"}, {"Out", "XShape"}); } } diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 2e4dc53290226..1c013e1ca3494 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -78,6 +78,7 @@ import paddle.reader # noqa: F401 import paddle.static # noqa: F401 import paddle.vision # noqa: F401 +import paddle.geometric # noqa: F401 from .tensor.attribute import is_complex # noqa: F401 from .tensor.attribute import is_integer # noqa: F401 diff 
--git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index f9b5b9a5323bc..18ceb79ea8f73 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -1246,6 +1246,108 @@ def __init__(self, name): self._forward_implemented = True self._backward_implemented = True + def calc_cost(self, op_role, dist_op, ctx, cluster): + cost = None + if int(op_role) == int(OpRole.Forward): + cost = self.calc_fwd_cost(dist_op, ctx, cluster) + elif int(op_role) == int(OpRole.Backward): + cost = self.calc_bwd_cost(dist_op, ctx, cluster) + assert cost is not None + return cost + + def calc_bwd_cost(self, dist_op, ctx, cluster): + # by now the backward function only insert the gradient allreduce for dist op itself + res = [] + backward_op = dist_op.serial_op + dist_attr = dist_op.dist_attr + main_block = backward_op.block + vars = main_block.vars + Y_var_dim_mapping = dist_attr.get_input_dims_mapping( + backward_op.input("Y")[0]) + process_mesh = dist_attr.process_mesh + processes = process_mesh.processes + # col parallel: matmul + allreduce + assert Y_var_dim_mapping[0] < 0 + parallel_axis = Y_var_dim_mapping[1] + + has_x_grad = len(backward_op.output("X@GRAD")) > 0 + if has_x_grad: + assert len(backward_op.output("X@GRAD")) == 1 + + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + + cost_mapping = build_comp_costs_from_descs(MatmulV2GradOpCost, ctx, + processes, desc_mapping, + cluster) + res.append(cost_mapping) + + # calc comm op cost + if has_x_grad: + attrs = {"use_calc_stream": True, "use_model_parallel": True} + var_names = backward_op.output("X@GRAD") + c_allreduce_sum_desc_mapping = build_comm_desc_from_dist_op( + "c_allreduce_sum", + dist_op, + ctx, + var_names, + attrs=attrs, + parallel_axis=parallel_axis) + comm_op_cost_list = build_comm_costs_from_descs( + AllreduceSumOpCost, ctx, processes, + c_allreduce_sum_desc_mapping, cluster) + res.append(comm_op_cost_list) + + # need gradient allreduce + process_mesh = dist_attr.process_mesh + var_dim_mapping = dist_attr.get_input_dims_mapping( + backward_op.input("X")[0]) + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[ + batch_size_axis] > 1 and is_parameter_related( + backward_op.input("Y")[0], main_block): + parallel_axis = batch_size_axis + attrs = {"use_calc_stream": True} + var_names = [backward_op.output('Y@GRAD')[0]] + build_dp_costs(res, dist_op, ctx, var_names, attrs, parallel_axis, + cluster) + return res + + def calc_fwd_cost(self, dist_op, ctx, cluster): + # calc comp op cost + # TODO: trans shape if trans_x or trans_y is True + comp_desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + processes = dist_op.dist_attr.process_mesh.processes + comp_cost_mapping = build_comp_costs_from_descs(MatmulV2OpCost, ctx, + processes, + comp_desc_mapping, + cluster) + + # calc comm op cost + serial_op = dist_op.serial_op + vars = serial_op.block.vars + + parallel_axis = dist_op.dist_attr.get_input_dims_mapping( + serial_op.input("Y")[0])[-1] + attrs = {"use_calc_stream": True, "use_model_parallel": True} + + var_names = serial_op.input("X") + c_identity_desc_mapping = build_comm_desc_from_dist_op( + "c_identity", + dist_op, + ctx, + var_names, + attrs=attrs, + parallel_axis=parallel_axis) + comm_op_cost_list = 
build_comm_costs_from_descs( + IdentityOpCost, ctx, processes, c_identity_desc_mapping, cluster) + + res_cost = [comm_op_cost_list, comp_cost_mapping] + return res_cost + def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr @@ -1468,6 +1570,100 @@ def __init__(self, name): self._forward_implemented = True self._backward_implemented = True + def calc_cost(self, op_role, dist_op, ctx, cluster): + cost = None + if int(op_role) == int(OpRole.Forward): + cost = self.calc_fwd_cost(dist_op, ctx, cluster) + elif int(op_role) == int(OpRole.Backward): + cost = self.calc_bwd_cost(dist_op, ctx, cluster) + assert cost is not None + return cost + + def calc_bwd_cost(self, dist_op, ctx, cluster): + # by now the backward function only insert the gradient allreduce for dist op itself + res = [] + backward_op = dist_op.serial_op + dist_attr = dist_op.dist_attr + main_block = backward_op.block + vars = main_block.vars + Y_var_dim_mapping = dist_attr.get_input_dims_mapping( + backward_op.input("Y")[0]) + assert Y_var_dim_mapping[1] < 0 + parallel_axis = Y_var_dim_mapping[0] + + process_mesh = dist_attr.process_mesh + processes = process_mesh.processes + # calc comm op cost + var_names = [backward_op.input("Out@GRAD")[0]] + attrs = {"use_calc_stream": True, "use_model_parallel": True} + c_identity_desc_mapping = build_comm_desc_from_dist_op( + "c_identity", + dist_op, + ctx, + var_names, + attrs=attrs, + parallel_axis=parallel_axis) + comm_op_cost_list = build_comm_costs_from_descs( + IdentityOpCost, ctx, processes, c_identity_desc_mapping, cluster) + res.append(comm_op_cost_list) + + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + cost_mapping = build_comp_costs_from_descs(MatmulV2GradOpCost, ctx, + processes, desc_mapping, + cluster) + res.append(cost_mapping) + + # need gradient allreduce + process_mesh = dist_attr.process_mesh + var_dim_mapping = dist_attr.get_input_dims_mapping( + backward_op.input("X")[0]) + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[ + batch_size_axis] > 1 and is_parameter_related( + backward_op.input("Y")[0], main_block): + parallel_axis = batch_size_axis + attrs = {"use_calc_stream": True} + var_names = [backward_op.output('Y@GRAD')[0]] + build_dp_costs(res, dist_op, ctx, var_names, attrs, parallel_axis, + cluster) + return res + + def calc_fwd_cost(self, dist_op, ctx, cluster): + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + processes = dist_op.dist_attr.process_mesh.processes + cost_mapping = build_comp_costs_from_descs(MatmulV2OpCost, ctx, + processes, desc_mapping, + cluster) + + # calc comm op cost + serial_op = dist_op.serial_op + vars = serial_op.block.vars + + parallel_axis = dist_op.dist_attr.get_input_dims_mapping( + serial_op.input("Y")[0])[-2] + attrs = {"use_calc_stream": True, "use_model_parallel": True} + + var_names = serial_op.output("Out") + c_allreduce_sum_desc_mapping = build_comm_desc_from_dist_op( + "c_allreduce_sum", + dist_op, + ctx, + var_names, + attrs=attrs, + parallel_axis=parallel_axis) + + comm_op_cost_list = build_comm_costs_from_descs( + AllreduceSumOpCost, ctx, processes, c_allreduce_sum_desc_mapping, + cluster) + res_cost = [cost_mapping, comm_op_cost_list] + + return res_cost + def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr @@ -1677,6 +1873,61 @@ class 
DistributedMatmulV2Impl2(DistributedOperatorImpl): def __init__(self, name): super(DistributedMatmulV2Impl2, self).__init__(name) + def calc_cost(self, op_role, dist_op, ctx, cluster): + cost = None + if int(op_role) == int(OpRole.Forward): + cost = self.calc_fwd_cost(dist_op, ctx, cluster) + elif int(op_role) == int(OpRole.Backward): + cost = self.calc_bwd_cost(dist_op, ctx, cluster) + assert cost is not None + return cost + + def calc_bwd_cost(self, dist_op, ctx, cluster): + res = [] + backward_op = dist_op.serial_op + dist_attr = dist_op.dist_attr + main_block = backward_op.block + vars = main_block.vars + process_mesh = dist_attr.process_mesh + + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + processes = process_mesh.processes + cost_mapping = build_comp_costs_from_descs(MatmulV2GradOpCost, ctx, + processes, desc_mapping, + cluster) + res.append(cost_mapping) + + # need gradient allreduce + var_dim_mapping = dist_attr.get_input_dims_mapping( + backward_op.input("X")[0]) + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[ + batch_size_axis] > 1 and is_parameter_related( + backward_op.input("Y")[0], main_block): + parallel_axis = batch_size_axis + attrs = {"use_calc_stream": True} + var_names = [backward_op.output('Y@GRAD')[0]] + build_dp_costs(res, dist_op, ctx, var_names, attrs, parallel_axis, + cluster) + + return res + + def calc_fwd_cost(self, dist_op, ctx, cluster): + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + processes = dist_op.dist_attr.process_mesh.processes + cost_mapping = build_comp_costs_from_descs(MatmulV2OpCost, ctx, + processes, desc_mapping, + cluster) + + res_cost = [cost_mapping] + + return res_cost + def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr @@ -1765,6 +2016,102 @@ def __init__(self, name): self._forward_implemented = True self._backward_implemented = True + def calc_cost(self, op_role, dist_op, ctx, cluster): + cost = None + if int(op_role) == int(OpRole.Forward): + cost = self.calc_fwd_cost(dist_op, ctx, cluster) + elif int(op_role) == int(OpRole.Backward): + cost = self.calc_bwd_cost(dist_op, ctx, cluster) + assert cost is not None + return cost + + def calc_bwd_cost(self, dist_op, ctx, cluster): + # by now the backward function only insert the gradient allreduce for dist op itself + res = [] + backward_op = dist_op.serial_op + dist_attr = dist_op.dist_attr + main_block = backward_op.block + vars = main_block.vars + Y_var_dim_mapping = dist_attr.get_input_dims_mapping( + backward_op.input("Y")[0]) + # col parallel: matmul + allreduce + assert Y_var_dim_mapping[0] < 0 + parallel_axis = Y_var_dim_mapping[1] + + has_x_grad = len(backward_op.output("X@GRAD")) > 0 + if has_x_grad: + assert len(backward_op.output("X@GRAD")) == 1 + + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + process_mesh = dist_attr.process_mesh + processes = process_mesh.processes + cost_mapping = build_comp_costs_from_descs(MulGradOpCost, ctx, + processes, desc_mapping, + cluster) + res.append(cost_mapping) + + # calc comm op cost + if has_x_grad: + attrs = {"use_calc_stream": True, "use_model_parallel": True} + var_names = backward_op.output("X@GRAD") + c_allreduce_sum_desc_mapping = build_comm_desc_from_dist_op( + "c_allreduce_sum", + dist_op, + ctx, + var_names, + attrs=attrs, + 
parallel_axis=parallel_axis) + comm_op_cost_list = build_comm_costs_from_descs( + AllreduceSumOpCost, ctx, processes, + c_allreduce_sum_desc_mapping, cluster) + res.append(comm_op_cost_list) + + # need gradient allreduce + var_dim_mapping = dist_attr.get_input_dims_mapping( + backward_op.input("X")[0]) + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[ + batch_size_axis] > 1 and is_parameter_related( + backward_op.input("Y")[0], main_block): + parallel_axis = batch_size_axis + attrs = {"use_calc_stream": True} + var_names = [backward_op.output('Y@GRAD')[0]] + build_dp_costs(res, dist_op, ctx, var_names, attrs, parallel_axis, + cluster) + return res + + def calc_fwd_cost(self, dist_op, ctx, cluster): + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + processes = dist_op.dist_attr.process_mesh.processes + cost_mapping = build_comp_costs_from_descs(MulOpCost, ctx, processes, + desc_mapping, cluster) + + # calc comm op cost + serial_op = dist_op.serial_op + vars = serial_op.block.vars + parallel_axis = dist_op.dist_attr.get_input_dims_mapping( + serial_op.input("Y")[0])[-1] + attrs = {"use_calc_stream": True, "use_model_parallel": True} + var_names = serial_op.input("X") + c_identity_desc_mapping = build_comm_desc_from_dist_op( + "c_identity", + dist_op, + ctx, + var_names, + attrs=attrs, + parallel_axis=parallel_axis) + + comm_op_cost_list = build_comm_costs_from_descs( + IdentityOpCost, ctx, processes, c_identity_desc_mapping, cluster) + res_cost = [comm_op_cost_list, cost_mapping] + + return res_cost + def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr @@ -1916,7 +2263,24 @@ def forward(ctx, *args, **kwargs): "y_num_col_dims": src_op.desc.attr("y_num_col_dims"), OP_ROLE_KEY: src_op.attr('op_role') } - inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} + inputs = {'X': intermediate_var_0, 'Y': Weight_var} + + inputs_ref_shape = {} + inputs_original_shape = {} + for var_name in inputs: + if var_name == "X": + var = X_var + else: + var = inputs[var_name] + inputs_original_shape[var_name] = var.shape + input_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(var) + input_var_dist_attr = op_dist_attr.get_input_dist_attr(var.name) + input_ref_shape = infer_shape(main_block, var, + input_tensor_dist_attr, + input_var_dist_attr) + inputs_ref_shape[var_name] = input_ref_shape + var.desc.set_shape(input_ref_shape) + mul_op = main_block.append_op(type='mul', inputs=inputs, outputs={'Out': Out_var}, @@ -1924,6 +2288,11 @@ def forward(ctx, *args, **kwargs): if Out_var.shape != ref_shape_out: Out_var.desc.set_shape(ref_shape_out) + for var_name in inputs: + var = inputs[var_name] + original_shape = inputs_original_shape[var_name] + var.desc.set_shape(original_shape) + # set dist op's dist_attr with serial op's dist_attr # c_identity identity_op_dist_attr = OperatorDistributedAttribute() @@ -1988,6 +2357,100 @@ def __init__(self, name): self._forward_implemented = True self._backward_implemented = True + def calc_cost(self, op_role, dist_op, ctx, cluster): + cost = None + if int(op_role) == int(OpRole.Forward): + cost = self.calc_fwd_cost(dist_op, ctx, cluster) + elif int(op_role) == int(OpRole.Backward): + cost = self.calc_bwd_cost(dist_op, ctx, cluster) + assert cost is not None + return cost + + def calc_bwd_cost(self, dist_op, ctx, cluster): + # by now the backward function only insert the gradient allreduce for dist 
op itself + res = [] + backward_op = dist_op.serial_op + dist_attr = dist_op.dist_attr + process_mesh = dist_attr.process_mesh + main_block = backward_op.block + vars = main_block.vars + Y_var_dim_mapping = dist_attr.get_input_dims_mapping( + backward_op.input("Y")[0]) + assert Y_var_dim_mapping[1] < 0 + parallel_axis = Y_var_dim_mapping[0] + + # calc comm op cost + var_names = [backward_op.input("Out@GRAD")[0]] + attrs = {"use_calc_stream": True, "use_model_parallel": True} + c_identity_desc_mapping = build_comm_desc_from_dist_op( + "c_identity", + dist_op, + ctx, + var_names, + attrs=attrs, + parallel_axis=parallel_axis) + processes = process_mesh.processes + comm_op_cost_list = build_comm_costs_from_descs( + IdentityOpCost, ctx, processes, c_identity_desc_mapping, cluster) + res.append(comm_op_cost_list) + + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + cost_mapping = build_comp_costs_from_descs(MulGradOpCost, ctx, + processes, desc_mapping, + cluster) + res.append(cost_mapping) + + # need gradient allreduce + var_dim_mapping = dist_attr.get_input_dims_mapping( + backward_op.input("X")[0]) + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[ + batch_size_axis] > 1 and is_parameter_related( + backward_op.input("Y")[0], main_block): + parallel_axis = batch_size_axis + attrs = {"use_calc_stream": True} + var_names = [backward_op.output('Y@GRAD')[0]] + build_dp_costs(res, dist_op, ctx, var_names, attrs, parallel_axis, + cluster) + return res + + def calc_fwd_cost(self, dist_op, ctx, cluster): + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + processes = dist_op.dist_attr.process_mesh.processes + cost_mapping = build_comp_costs_from_descs(MulOpCost, ctx, processes, + desc_mapping, cluster) + + # calc comm op cost + serial_op = dist_op.serial_op + vars = serial_op.block.vars + + parallel_axis = dist_op.dist_attr.get_input_dims_mapping( + serial_op.input("Y")[0])[-2] + attrs = {"use_calc_stream": True, "use_model_parallel": True} + + var_names = serial_op.output("Out") + c_allreduce_sum_desc_mapping = build_comm_desc_from_dist_op( + "c_allreduce_sum", + dist_op, + ctx, + var_names, + attrs=attrs, + parallel_axis=parallel_axis) + + # print("dist_matmul.py dist_op: ", dist_op) + comm_op_cost_list = build_comm_costs_from_descs( + AllreduceSumOpCost, ctx, processes, c_allreduce_sum_desc_mapping, + cluster) + + res_cost = [cost_mapping, comm_op_cost_list] + + return res_cost + def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr @@ -2122,13 +2585,32 @@ def forward(ctx, *args, **kwargs): ctx.set_tensor_dist_attr_for_program(intermediate_var_0, out_var_dist_attr) + inputs_ref_shape = {} + inputs_original_shape = {} + for var_name in inputs: + var = inputs[var_name] + inputs_original_shape[var_name] = var.shape + input_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(var) + input_var_dist_attr = op_dist_attr.get_input_dist_attr(var.name) + input_ref_shape = infer_shape(main_block, var, + input_tensor_dist_attr, + input_var_dist_attr) + inputs_ref_shape[var_name] = input_ref_shape + var.desc.set_shape(input_ref_shape) + mul_op = main_block.append_op(type='mul', inputs=inputs, outputs={'Out': intermediate_var_0}, attrs=attrs) + if intermediate_var_0.shape != ref_shape: intermediate_var_0.desc.set_shape(ref_shape) + for var_name in inputs: + var = inputs[var_name] + 
original_shape = inputs_original_shape[var_name] + var.desc.set_shape(original_shape) + c_allreduce_sum_op = main_block.append_op( type='c_allreduce_sum', inputs={'X': intermediate_var_0}, @@ -2139,6 +2621,7 @@ def forward(ctx, *args, **kwargs): 'use_model_parallel': True, OP_ROLE_KEY: src_op.attr('op_role') }) + if Out_var.shape != ref_shape: Out_var.desc.set_shape(ref_shape) @@ -2198,6 +2681,59 @@ class DistributedMulImpl2(DistributedOperatorImpl): def __init__(self, name): super(DistributedMulImpl2, self).__init__(name) + def calc_cost(self, op_role, dist_op, ctx, cluster): + cost = None + if int(op_role) == int(OpRole.Forward): + cost = self.calc_fwd_cost(dist_op, ctx, cluster) + elif int(op_role) == int(OpRole.Backward): + cost = self.calc_bwd_cost(dist_op, ctx, cluster) + assert cost is not None + return cost + + def calc_bwd_cost(self, dist_op, ctx, cluster): + res = [] + backward_op = dist_op.serial_op + dist_attr = dist_op.dist_attr + main_block = backward_op.block + vars = main_block.vars + + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + process_mesh = dist_attr.process_mesh + processes = process_mesh.processes + cost_mapping = build_comp_costs_from_descs(MulGradOpCost, ctx, + processes, desc_mapping, + cluster) + res.append(cost_mapping) + + # need gradient allreduce + var_dim_mapping = dist_attr.get_input_dims_mapping( + backward_op.input("X")[0]) + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[ + batch_size_axis] > 1 and is_parameter_related( + backward_op.input("Y")[0], main_block): + parallel_axis = batch_size_axis + attrs = {"use_calc_stream": True} + var_names = [backward_op.output('Y@GRAD')[0]] + build_dp_costs(res, dist_op, ctx, var_names, attrs, parallel_axis, + cluster) + + return res + + def calc_fwd_cost(self, dist_op, ctx, cluster): + # calc comp op cost + desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, + dist_context=ctx) + processes = dist_op.dist_attr.process_mesh.processes + cost_mapping = build_comp_costs_from_descs(MulOpCost, ctx, processes, + desc_mapping, cluster) + + res_cost = [cost_mapping] + return res_cost + def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 620f23b9e1919..bf36015f89407 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -476,6 +476,10 @@ def new_group(ranks=None, backend=None): place = core.MLUPlace(genv.device_id) core.CNCLParallelContext(strategy, place).init_with_ring_id(ring_id) + elif core.is_compiled_with_xpu(): + place = core.XPUPlace(genv.device_id) + core.BKCLParallelContext(strategy, + place).init_with_ring_id(ring_id) else: assert False, ("no cuda device found") else: diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 0cfb946d3d8ca..8ac5b93ef672b 100755 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -76,7 +76,12 @@ save_inference_model = fleet.save_inference_model save_persistables = fleet.save_persistables save_cache_model = fleet.save_cache_model +check_save_pre_patch_done = fleet.check_save_pre_patch_done +save_one_table = fleet.save_one_table +save_dense_params = fleet.save_dense_params load_model = fleet.load_model +load_inference_model = fleet.load_inference_model 
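These new module-level names simply re-export the `Fleet` methods added in `fleet_base.py` below. A minimal call-pattern sketch, assuming a parameter-server fleet job has already been initialized and the optimizer wrapped with `fleet.distributed_optimizer` (paths and mode values are illustrative):

    import paddle
    import paddle.distributed.fleet as fleet

    fleet.init()
    # build net, fleet.distributed_optimizer(...), run training ...
    place = paddle.fluid.CPUPlace()
    exe = paddle.fluid.Executor(place)

    fleet.save_one_table(0, "path", mode=0)        # persist a single table
    fleet.save_dense_params(exe, "path",
                            scope=paddle.static.global_scope(),
                            program=paddle.static.default_main_program())
    fleet.check_save_pre_patch_done()              # synchronize workers (see _check_save_pre_patch_done)

    fleet.load_one_table(0, "path", mode=0)        # restore that table later
    fleet.load_inference_model("path", mode=1)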
+load_one_table = fleet.load_one_table minimize = fleet.minimize distributed_model = fleet.distributed_model step = fleet.step diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 1a9b3f565b77a..52f3812d8a5f5 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -710,10 +710,60 @@ def load_model(self, path, mode): # build net # fleet.distributed_optimizer(...) - fleet.load_model("path", "mode") + fleet.load_model("path", mode=0) """ - self._runtime_handle.load_model(path, mode) + self._runtime_handle._load_persistables(path, mode) + + @is_non_distributed_check + @inited_runtime_handler + def load_one_table(self, table_id, path, mode): + """ + load fleet one table from path + + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + + # build net + # fleet.distributed_optimizer(...) + + fleet.load_one_table(0, "path", mode=0) + + """ + self._runtime_handle._load_one_table(table_id, path, mode) + + @is_non_distributed_check + @inited_runtime_handler + def load_inference_model(self, path, mode): + """ + load fleet inference model from path + + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + + # build net + # fleet.distributed_optimizer(...) + + fleet.load_inference_model("path", mode=1) + + """ + self._runtime_handle._load_inference_model(path, mode) @is_non_distributed_check @inited_runtime_handler @@ -906,6 +956,70 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): def save_cache_model(self, dirname, **configs): return self._runtime_handle._save_cache_model(dirname, **configs) + @is_non_distributed_check + @inited_runtime_handler + def check_save_pre_patch_done(self): + return self._runtime_handle._check_save_pre_patch_done() + + @is_non_distributed_check + @inited_runtime_handler + def save_one_table(self, table_id, path, mode): + """ + save fleet one table from path + + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + + # build net + # fleet.distributed_optimizer(...) + + fleet.save_one_table(0, "path", mode=0) + + """ + self._runtime_handle._save_one_table(table_id, path, mode) + + @is_non_distributed_check + @inited_runtime_handler + def save_dense_params(self, + executor, + dirname, + scope, + program, + var_names=None): + """ + save fleet one table from path + + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + import paddle + place = paddle.fluid.CPUPlace() + exe = paddle.fluid.Executor(place) + + # build net + # fleet.distributed_optimizer(...) 
+ + fleet.save_dense_params(exe, "path", scope=paddle.static.global_scope(), program=paddle.static.default_main_program()) + + """ + self._runtime_handle._save_dense_params(executor, dirname, scope, + program, var_names) + def shrink(self, threshold=None): self._runtime_handle._shrink(threshold) @@ -1856,9 +1970,8 @@ def unscale_method(self, optimizer): group=None) self._found_inf = is_found_inf.numpy()[0] - # Only tensor_parallel and pipeline_parallel need to modify scaler - if self._hcg.get_parallel_mode() in (ParallelMode.TENSOR_PARALLEL, - ParallelMode.PIPELINE_PARALLEL): + # Only data_parallel doesn't need to modify scaler + if self._hcg.get_parallel_mode() is not ParallelMode.DATA_PARALLEL: scaler._unscale = MethodType(unscale_method, scaler) return scaler diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index 14ca1322e789f..c9d7c71dbbb18 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +from paddle.fluid import core from paddle.fluid.dygraph.layers import Layer from .random import get_rng_state_tracker from paddle.nn import functional as F @@ -27,6 +28,13 @@ # language models using model parallelism[J]. arXiv preprint arXiv:1909.08053, 2019. (https://arxiv.org/abs/1909.08053) +def is_fused_matmul_bias_supported(): + if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): + return hasattr(core.ops, 'fused_gemm_epilogue') + else: + return False + + class VocabParallelEmbedding(Layer): def __init__(self, @@ -100,7 +108,8 @@ def __init__(self, weight_attr=None, has_bias=None, gather_output=True, - name=None): + name=None, + fuse_matmul_bias=False): super(ColumnParallelLinear, self).__init__() self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( @@ -147,6 +156,18 @@ def __init__(self, else: self.bias = None + self.linear = F.linear + + if fuse_matmul_bias: + if not is_fused_matmul_bias_supported(): + raise NotImplementedError( + "You set fuse_matmul_bias=True in ColumnParallelLinear, " + "however, the paddle you are using not support this operation. " + "Please set fuse_matmul_bias=False or use paddle compiled " + "with cuda 11.6 or higher.") + from paddle.incubate.nn.functional import fused_linear + self.linear = fused_linear + def forward(self, x): # use inner api to process identity if self.is_mp: @@ -155,10 +176,10 @@ def forward(self, x): else: input_parallel = x - output_parallel = F.linear(input_parallel, - self.weight, - self.bias, - name=self._name) + output_parallel = self.linear(input_parallel, + self.weight, + self.bias, + name=self._name) if self.gather_output and self.is_mp: output = paddle.distributed.collective._c_concat( @@ -176,7 +197,8 @@ def __init__(self, weight_attr=None, has_bias=True, input_is_parallel=False, - name=None): + name=None, + fuse_matmul_bias=False): super(RowParallelLinear, self).__init__() self.in_features = in_features @@ -225,6 +247,18 @@ def __init__(self, else: self.bias = None + self.linear = F.linear + + if fuse_matmul_bias: + if not is_fused_matmul_bias_supported(): + raise NotImplementedError( + "You set fuse_matmul_bias=True in RowParallelLinear, " + "however, the paddle you are using not support this operation. 
" + "Please set fuse_matmul_bias=False or use paddle compiled " + "with cuda 11.6 or higher.") + from paddle.incubate.nn.functional import fused_linear + self.linear = fused_linear + def forward(self, x): if self.input_is_parallel or (not self.is_mp): input_parallel = x @@ -233,18 +267,22 @@ def forward(self, x): input_parallel = paddle.distributed.collective._c_split( x, group=self.model_parallel_group) - output_parallel = F.linear(input_parallel, self.weight, name=self._name) - if self.is_mp: + output_parallel = self.linear(input_parallel, + self.weight, + name=self._name) output_ = paddle.distributed.collective._mp_allreduce( output_parallel, group=self.model_parallel_group, use_calc_stream=True, use_model_parallel=True) + output = output_ + self.bias if self.bias is not None else output_ else: - output_ = output_parallel + output = self.linear(input_parallel, + self.weight, + self.bias, + name=self._name) - output = output_ + self.bias if self.bias is not None else output_ return output diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py index fdbf0312db664..55c6a3308b8c1 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py @@ -18,7 +18,7 @@ from paddle import _C_ops from paddle.fluid import core from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle.fluid.framework import _non_static_mode, default_main_program +from paddle.fluid.framework import _non_static_mode, default_main_program, Variable from paddle.fluid.layer_helper import LayerHelper __all__ = [] @@ -187,11 +187,12 @@ def dropout(x, if rng_name is None: return paddle.nn.functional.dropout(x, p, axis, training, mode, name) + if not isinstance(p, (float, int, Variable)): + raise TypeError("p argument should be a number(int|float) or Variable") + # fast return for p == 0 - if p == 0: return x + if isinstance(p, (int, float)) and p == 0: return x - assert isinstance(p, (float, int)), \ - TypeError("p argument should be a number") assert 0 <= p <= 1, ValueError("p argument should between 0 and 1") assert mode in ('downscale_in_infer', 'upscale_in_train'), \ ValueError( @@ -211,6 +212,11 @@ def dropout(x, seed = determinate_seed(rng_name) + if isinstance(p, Variable) and not p.shape != [1]: + raise TypeError( + "Required p.shape == [1] if type(p) is Variable, but received p.shape = {}" + .format(p.shape)) + helper = LayerHelper('dropout', **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'dropout') diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index af38d9f513810..95635154c33aa 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -486,6 +486,7 @@ def _run_cmd(self, cmd, redirect_stderr=False, retry_times=5): time.sleep(retry_sleep_second) if ret == 134: raise FSShellCmdAborted(cmd) + return ret, output.splitlines() @_handle_errors() @@ -615,10 +616,12 @@ def is_dir(self, fs_path): def _is_dir(self, fs_path): cmd = "test -d {}".format(fs_path, redirect_stderr=True) - ret, lines = self._run_cmd(cmd) + ret, lines = self._run_cmd(cmd, retry_times=1) if ret: # other error if self._test_match(lines): + print('raise exception: ') + print('\n'.join(lines)) raise ExecuteError(cmd) return False @@ -682,13 +685,10 @@ def is_exist(self, fs_path): client = 
HDFSClient(hadoop_home, configs) ret = client.is_exist("hdfs:/test_hdfs_client") """ - cmd = "ls {} ".format(fs_path) - ret, out = self._run_cmd(cmd, redirect_stderr=True) + cmd = "test -e {} ".format(fs_path) + ret, out = self._run_cmd(cmd, redirect_stderr=True, retry_times=1) if ret != 0: - for l in out: - if "No such file or directory" in l: - return False - raise ExecuteError(cmd) + return False return True @@ -712,7 +712,7 @@ def upload_dir(self, local_dir, dest_dir, overwrite=False): self._try_upload(local_dir, dest_dir) # can't retry - def upload(self, local_path, fs_path, multi_processes=1, overwrite=False): + def upload(self, local_path, fs_path, multi_processes=5, overwrite=False): """ Upload the local path to remote HDFS. @@ -766,11 +766,7 @@ def get_local_files(path): local = LocalFS() if not local.is_exist(local_path): raise FSFileNotExistsError("{} not exists".format(local_path)) - # upload_dir - if local.is_dir(local_path): - self.upload_dir(local_path, fs_path, overwrite=overwrite) - return - # upload files + all_files = get_local_files(local_path) if not all_files: print("there are nothing need to upload, function exit") @@ -805,7 +801,7 @@ def _try_upload(self, local_path, fs_path): raise e # can't retry - def download(self, fs_path, local_path, multi_processes=1, overwrite=False): + def download(self, fs_path, local_path, multi_processes=5, overwrite=False): """ Download remote HDFS path to the local. @@ -962,7 +958,7 @@ def _try_mv(self, fs_src_path, fs_dst_path): cmd = "mv {} {}".format(fs_src_path, fs_dst_path) ret = 0 try: - ret, _ = self._run_cmd(cmd) + ret, _ = self._run_cmd(cmd, retry_times=1) if ret != 0: raise ExecuteError(cmd) except Exception as e: @@ -1090,7 +1086,7 @@ def cat(self, fs_path=None): @_handle_errors() def _try_cat(self, fs_path): cmd = "cat {}".format(fs_path) - ret, output = self._run_cmd(cmd) + ret, output = self._run_cmd(cmd, retry_times=1) if ret != 0: raise ExecuteError(cmd) return output diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index 3e8f0de3e69d5..921f653b48a6d 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -57,8 +57,6 @@ def is_legacy_mode(self): return True legacy_env_list = [ - 'DISTRIBUTED_TRAINER_ENDPOINTS', - 'PADDLE_ELASTIC_JOB_ID', 'FLAGS_START_PORT', ] @@ -103,4 +101,7 @@ def continous_log(self) -> bool: def set_env_in_args(self): for k, v in env_args_mapping.items(): if k in self.envs: + print( + f"LAUNCH WARNNING args {v} is override by env {self.envs[k]}" + ) setattr(self.args, v, self.envs[k]) diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py index f6624e88e276d..b44065c670005 100644 --- a/python/paddle/distributed/launch/context/args_envs.py +++ b/python/paddle/distributed/launch/context/args_envs.py @@ -96,9 +96,12 @@ def parse_args(): help="unique id of the job. Default default") base_group.add_argument("--devices", + "--gpus", + "--npus", + "--xpus", type=str, default=None, - help="accelerate devices. as --gpus,npus,xps") + help="accelerate devices. 
as --gpus,npus,xpus") base_group.add_argument("--host", type=str, default=None, help="host ip") diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py index c48ef04cd0920..14997df24590f 100644 --- a/python/paddle/distributed/launch/context/device.py +++ b/python/paddle/distributed/launch/context/device.py @@ -105,10 +105,9 @@ def parse_device(self): os.getenv('PADDLE_XCCL_BACKEND').upper()) if visible_devices_str in os.environ: visible_devices = os.getenv(visible_devices_str) - elif 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ: + elif 'CUDA_VISIBLE_DEVICES' in os.environ: dev._dtype = DeviceType.GPU - visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( - "NVIDIA_VISIBLE_DEVICES") + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") elif 'XPU_VISIBLE_DEVICES' in os.environ: dev._dtype = DeviceType.XPU visible_devices = os.getenv("XPU_VISIBLE_DEVICES") @@ -151,8 +150,7 @@ def get_custom_devices_count(device_type): elif fluid.core.is_compiled_with_cuda(): dev._dtype = DeviceType.GPU num = fluid.core.get_cuda_device_count() - visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( - "NVIDIA_VISIBLE_DEVICES") + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") elif fluid.core.is_compiled_with_xpu(): dev._dtype = DeviceType.XPU num = fluid.core.get_xpu_device_count() diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index 8ec21f72ea4de..873cfe09ac8b8 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -108,7 +108,9 @@ def build_pod(self): else: e.update({'PADDLE_DISTRI_BACKEND': 'gloo'}) - self.add_container(envs=e, log_tag=i) + # log_file = "{}.{}.{}.log".format(self.job.id, self.pod.name, i) + log_file = f"workerlog.{i}" + self.add_container(envs=e, log_file=log_file) return True diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index bc628be59dc22..56499cb647134 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -92,6 +92,9 @@ def watch(self) -> bool: self.master.set_status(status) + while self.pod.logs(): + pass + self.ctx.logger.info("Pod {}".format(status)) return True @@ -105,6 +108,9 @@ def watch(self) -> bool: fc = self.pod.failed_container() self.ctx.logger.info("Pod {}".format(status)) self.ctx.logger.error("Container failed !!!\n{}".format(fc[0])) + self.ctx.logger.info( + "------------------------- ERROR LOG DETAIL -------------------------" + ) fc[0].tail() if self.ctx.args.elastic_level <= 0: @@ -170,7 +176,11 @@ def build_pod(self) -> bool: raise NotImplementedError def _get_entrypoint(self): - entrypoint = [sys.executable, "-u", self.ctx.args.training_script] + if self.ctx.args.training_script.endswith('.py'): + entrypoint = [sys.executable, "-u", self.ctx.args.training_script] + else: + entrypoint = [self.ctx.args.training_script] + entrypoint.extend(self.ctx.args.training_script_args) return entrypoint @@ -199,13 +209,8 @@ def add_container(self, container=None, entrypoint=None, envs={}, - log_tag=None, + log_file=None, is_init=False): - if not is_init and log_tag is not None: - log_file = "{}.{}.{}.log".format(self.job.id, self.pod.name, - log_tag) - else: - log_file = None if not container: container = 
self.new_container(entrypoint=entrypoint, diff --git a/python/paddle/distributed/launch/controllers/ps.py b/python/paddle/distributed/launch/controllers/ps.py index 19429ce19614e..573f578d249e1 100644 --- a/python/paddle/distributed/launch/controllers/ps.py +++ b/python/paddle/distributed/launch/controllers/ps.py @@ -84,8 +84,8 @@ def _build_pod_with_args(self): "POD_IP": self.ctx.node.ip, } e.update(_gloo_envs) - log_tag = "ps.{}".format(i) - self.add_container(envs=e, log_tag=log_tag) + log_file = "serverlog.{}".format(i) + self.add_container(envs=e, log_file=log_file) trainer_rank_offset = 0 for s in trainer_endpoints: @@ -106,8 +106,8 @@ def _build_pod_with_args(self): "POD_IP": self.ctx.node.ip, } e.update(_gloo_envs) - log_tag = "trainer.{}".format(i) - self.add_container(envs=e, log_tag=log_tag) + log_file = "workerlog.{}".format(i) + self.add_container(envs=e, log_file=log_file) def _build_pod_with_master(self): @@ -191,8 +191,8 @@ def _build_pod_with_master(self): self.ctx.node.ip, } e.update(_gloo_envs) - log_tag = "ps.{}".format(i) - self.add_container(envs=e, log_tag=log_tag) + log_file = "serverlog.{}".format(i) + self.add_container(envs=e, log_file=log_file) for i in range(trainer_num): e = { @@ -216,8 +216,8 @@ def _build_pod_with_master(self): self.ctx.node.ip, } e.update(_gloo_envs) - log_tag = "trainer.{}".format(i) - self.add_container(envs=e, log_tag=log_tag) + log_file = "workerlog.{}".format(i) + self.add_container(envs=e, log_file=log_file) ''' NEW VERSION for i in range(server_num): e = { diff --git a/python/paddle/distributed/launch/job/container.py b/python/paddle/distributed/launch/job/container.py index e0f580da0ac45..8da5363915ced 100644 --- a/python/paddle/distributed/launch/job/container.py +++ b/python/paddle/distributed/launch/job/container.py @@ -99,7 +99,7 @@ def _get_fd(self, pth): d = os.path.dirname(pth) if not os.path.isdir(d): os.makedirs(d, exist_ok=True) - return open(pth, 'w') + return open(pth, 'a') except: return None @@ -115,11 +115,17 @@ def start(self): elif self._err: self._stderr = self._get_fd(self._err) or sys.stderr + if not self._log_handler: + self._log_handler = open(self._out) + self._log_handler.seek(0, 2) + self._log_start_offset = self._log_handler.tell() + self._proc = ProcessContext(self._entrypoint, env=self._env, out=self._stdout, err=self._stderr, shell=self._shell) + self._proc.start() def terminate(self, force=False): @@ -171,13 +177,16 @@ def logs(self, fn=None, offset=0, whence=1, limit=1000): try: if offset != 0 or whence != 1: + if whence == 0 and offset < self._log_start_offset: + offset = self._log_start_offset self._log_handler.seek(offset, whence) for _ in range(limit): line = self._log_handler.readline() if not line: - break + return False fn.write(line) + return True except: return diff --git a/python/paddle/distributed/launch/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py index a3a9e8c809a1c..946768db32ca9 100644 --- a/python/paddle/distributed/launch/plugins/__init__.py +++ b/python/paddle/distributed/launch/plugins/__init__.py @@ -32,8 +32,10 @@ def process_args(ctx): argdev = ctx.args.devices if argdev: for d in argdev.split(','): - assert d in ctx.node.device.labels, 'Device not found {}'.format( - argdev) + if d not in ctx.node.device.labels: + ctx.logger.error( + f'Device not found {d} from {argdev} for setting {ctx.node.device.labels}' + ) def collective_compatible(ctx): @@ -44,7 +46,7 @@ def collective_compatible(ctx): ctx.args.nnodes = len(hosts) ctx.logger.info( 'args reset 
by env PADDLE_TRAINER_ENDPOINTS\n{}'.format(eps)) - ''' + if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs: eps = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',') hosts = set([h.split(':')[0] for h in eps]) @@ -52,7 +54,6 @@ def collective_compatible(ctx): ctx.args.nnodes = len(hosts) ctx.logger.info( 'args reset by env DISTRIBUTED_TRAINER_ENDPOINTS\n{}'.format(eps)) - ''' def rewrite_host_ip(ctx): diff --git a/python/paddle/distributed/ps/coordinator.py b/python/paddle/distributed/ps/coordinator.py index 441ace16830d0..a012f338a514f 100755 --- a/python/paddle/distributed/ps/coordinator.py +++ b/python/paddle/distributed/ps/coordinator.py @@ -23,10 +23,13 @@ import os import logging -logging.basicConfig( - format='%(asctime)s %(levelname)-2s [%(filename)s:%(lineno)d] %(message)s', - level=logging.INFO) logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +formatter = logging.Formatter( + fmt='%(asctime)s %(levelname)-2s [%(filename)s:%(lineno)d] %(message)s') +ch = logging.StreamHandler() +ch.setFormatter(formatter) +logger.addHandler(ch) class ClientInfoAttr: diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index a99bd6649f0fd..af56556db447f 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -591,6 +591,10 @@ def _set(self, table_proto): table_proto.table_class = self.table_class table_proto.type = the_one_ps_pb2.PS_SPARSE_TABLE table_proto.shard_num = self.shard_num + if table_proto.sparse_table_cache_file_num > len( + get_ps_endpoints(self.context['role_maker'])): + table_proto.sparse_table_cache_file_num = len( + get_ps_endpoints(self.context['role_maker'])) self.common.table_name = self.context['grad_name_to_param_name'][ ctx.origin_varnames()[0]] @@ -914,6 +918,7 @@ def _set_basic_info(self, context): self.ps_desc_builder = PsDescBuilder(self.context) def _init_all_params(self, scopes, send_ctx, recv_map): + all_var_names = [] for name, ctx in send_ctx.items(): if ctx.is_sparse(): continue @@ -923,8 +928,11 @@ def _init_all_params(self, scopes, send_ctx, recv_map): var_names = recv_map[table_id] #print("init params:", idx, table_id, var_names) self._worker.push_dense_params(scope, table_id, var_names) + all_var_names.extend(var_names) + return all_var_names def _pull_all_dense(self, scopes, send_ctx, recv_map): + all_var_names = [] for name, ctx in send_ctx.items(): if ctx.is_sparse(): continue @@ -934,8 +942,11 @@ def _pull_all_dense(self, scopes, send_ctx, recv_map): var_names = recv_map[table_id] #print("pull all dense:", idx, table_id, var_names) self._worker.pull_dense_params(scope, table_id, var_names) + all_var_names.extend(var_names) + return all_var_names def _init_params(self, program, scope, send_ctx, recv_map): + all_var_names = [] for name, ctx in send_ctx.items(): if ctx.is_sparse(): continue @@ -945,8 +956,11 @@ def _init_params(self, program, scope, send_ctx, recv_map): var_names = recv_map[table_id] # print("init params:", table_id, var_names) self._worker.push_dense_params(scope, table_id, var_names) + all_var_names.extend(var_names) + return all_var_names def _pull_dense(self, program, scope, send_ctx, recv_map): + all_var_names = [] for name, ctx in send_ctx.items(): if ctx.is_sparse(): continue @@ -956,6 +970,8 @@ def _pull_dense(self, program, scope, send_ctx, recv_map): var_names = recv_map[table_id] # print("pull dense:", table_id, var_names) self._worker.pull_dense_params(scope, table_id, var_names) + all_var_names.extend(var_names) + 
return all_var_names def _init_worker(self, scopes=None): worker_desc = self.ps_desc_builder.build_worker_desc() @@ -1208,6 +1224,32 @@ def _get_inference_model_path(self, dirname): model_path = os.path.join(dirname, "dnn_plugin") return model_path + def _ps_save_dense_params(self, + executor, + dirname, + scope, + program, + var_names=None): + dense_map = get_the_one_recv_context( + self.context, split_dense_table=self.is_heter_ps_mode) + send_ctx = get_the_one_send_context( + self.context, + split_dense_table=self.is_heter_ps_mode, + use_origin_program=self.is_heter_ps_mode, + ep_list=self.endpoints) + if program is None or len(self.origin_main_programs) == 1: + program = self.origin_main_programs[0] + dense_var_names = self._pull_dense(program, scope, send_ctx, dense_map) + save_var_names = dense_var_names if var_names is None else var_names + vars = [program.global_block().var(i) for i in save_var_names] + import paddle + with paddle.static.scope_guard(scope): + paddle.static.save_vars(executor, + "./", + program, + vars=vars, + filename=dirname) + def _save_sparse_params(self, executor, dirname, context, main_program, mode): distributed_varnames = get_sparse_tablenames(self.origin_main_programs, @@ -1230,49 +1272,9 @@ def _save_sparse_params(self, executor, dirname, context, main_program, def _save_distributed_persistables(self, executor, dirname, - main_program, - mode=0): - - denses = get_the_one_recv_context( - self.context, - is_dense=True, - split_dense_table=self.is_heter_ps_mode, - use_origin_program=True) - sparses = get_the_one_recv_context( - self.context, - is_dense=False, - split_dense_table=self.is_heter_ps_mode, - use_origin_program=True) - - sparse_varnames = self._save_sparse_params(executor, dirname, sparses, - main_program, mode) - - recv_dense_varnames = [] - for id, names in denses.items(): - recv_dense_varnames.extend(names) - self._communicator.pull_dense(denses) - - saved_varnames = sparse_varnames - - remaining_vars = list( - filter(TheOnePSRuntime.__exclude_vars(saved_varnames), - main_program.list_vars())) - - import paddle - for var in remaining_vars: - # if var.name not in recv_dense_varnames: - # continue - tensor = var.get_value() - paddle.save(tensor, - os.path.join(dirname, var.name), - use_binary_format=True) - - def _ps_inference_save_persistables(self, - executor, - dirname, - main_program=None, - mode=0, - **kwargs): + main_program=None, + mode=0, + **kwargs): """ This function filters out all variables with `persistable==True` from the give `main_program` and then saves these variables to the folder `dirname` @@ -1301,9 +1303,6 @@ def _ps_inference_save_persistables(self, "in fleet.save() function, main_program must be as Program type, CompiledProgram is not allowed" ) - # Todo(MrChengmo): Save optimizer status - # self._save_distributed_persistables(executor, dirname, main_program, - # mode) self._worker.save_all_model(dirname, mode) def _ps_inference_save_inference_model(self, @@ -1384,14 +1383,8 @@ def _ps_inference_save_inference_model(self, os.path.join(model_path, var.name), use_binary_format=True) - def _save_inference_model(self, *args, **kwargs): - self._ps_inference_save_inference_model(*args, **kwargs) - - def _save_persistables(self, *args, **kwargs): - self._ps_inference_save_persistables(*args, **kwargs) - def _save_cache_model(self, dirname, **kwargs): - mode = kwargs.get("mode", 0) + mode = kwargs.get("mode", 1) table_id = kwargs.get("table_id", 0) self._worker.client_flush() fleet.util.barrier() @@ -1414,6 +1407,12 @@ def 
_save_cache_model(self, dirname, **kwargs): fleet.util.barrier() return feasign_num + def _check_save_pre_patch_done(self): + fleet.util.barrier() + if self.role_maker._is_first_worker(): + self._worker.check_save_pre_patch_done() + fleet.util.barrier() + def _load_sparse_params(self, dirname, context, main_program, mode): distributed_varnames = get_sparse_tablenames(self.origin_main_programs, True) @@ -1469,10 +1468,7 @@ def _ps_inference_load_inference_model(self, filter(TheOnePSRuntime.__exclude_vars(loaded_varnames), main_program.list_vars())) - if dirname.startswith("afs:") or dirname.startswith("hdfs:"): - model_path = "./dnn_plugin" - else: - model_path = os.path.join(dirname, "dnn_plugin") + model_path = self._get_inference_model_path(dirname) import paddle for var in remaining_vars: if var.name not in recv_dense_varnames: @@ -1482,14 +1478,40 @@ def _ps_inference_load_inference_model(self, self._init_params(main_program, scope, send_ctx, dense_map) - def _load_distributed_persistables(self, path, mode): - self._worker.load_model(path, mode) + def _save_one_table(self, table_id, path, mode): + if self.role_maker._is_first_worker(): + self._worker.save_one_model(table_id, path, mode) + fleet.util.barrier() - def load_model(self, path, mode): - if mode == 0 or mode == 3: - self._load_distributed_persistables(path, mode) - else: + def _save_dense_params(self, *args, **kwargs): + if self.role_maker._is_first_worker(): + self._ps_save_dense_params(*args, **kwargs) + fleet.util.barrier() + + def _save_persistables(self, *args, **kwargs): + if self.role_maker._is_first_worker(): + self._save_distributed_persistables(*args, **kwargs) + fleet.util.barrier() + + def _save_inference_model(self, *args, **kwargs): + if self.role_maker._is_first_worker(): + self._ps_inference_save_inference_model(*args, **kwargs) + fleet.util.barrier() + + def _load_one_table(self, table_id, path, mode): + if self.role_maker._is_first_worker(): + self._worker.load_one_table(table_id, path, mode) + fleet.util.barrier() + + def _load_persistables(self, path, mode): + if self.role_maker._is_first_worker(): + self._worker.load_model(path, mode) + fleet.util.barrier() + + def _load_inference_model(self, path, mode): + if self.role_maker._is_first_worker(): self._ps_inference_load_inference_model(path, mode) + fleet.util.barrier() def _shrink(self, threshold=None): if threshold is not None: diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index ad270c1a51733..58fb51b62b9a3 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -159,7 +159,7 @@ def check_dtype(param): sync_comm=sync_comm) else: raise ValueError("Please enter the correct level.") - if params_fp16 and isinstance(scaler, paddle.amp.GradScaler): + if isinstance(scaler, paddle.amp.GradScaler): if in_dygraph_mode(): scaler = GroupShardedScaler(scaler) else: diff --git a/python/paddle/distribution/transform.py b/python/paddle/distribution/transform.py index 3fabd27ec3401..d7a512aade2e5 100644 --- a/python/paddle/distribution/transform.py +++ b/python/paddle/distribution/transform.py @@ -590,7 +590,7 @@ def _codomain(self): class ExpTransform(Transform): r"""Exponent transformation with mapping :math:`y = \exp(x)`. - Exapmles: + Examples: .. code-block:: python @@ -1169,7 +1169,7 @@ def _codomain(self): class TanhTransform(Transform): r"""Tanh transformation with mapping :math:`y = \tanh(x)`. - Examples + Examples: .. 
code-block:: python diff --git a/python/paddle/fft.py b/python/paddle/fft.py index f44111cb76618..4b6a93edc447b 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -17,7 +17,7 @@ import paddle from .tensor.attribute import is_complex, is_floating_point, is_integer from .tensor.creation import _real_to_complex_dtype, _complex_to_real_dtype -from .fluid.framework import _non_static_mode +from .fluid.framework import _in_legacy_dygraph, in_dygraph_mode from . import _C_ops from .fluid.data_feeder import check_variable_and_dtype from .fluid.layer_helper import LayerHelper @@ -1392,7 +1392,9 @@ def fft_c2c(x, n, axis, norm, forward, name): op_type = 'fft_c2c' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_c2c(x, axes, norm, forward) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward) out = getattr(_C_ops, op_type)(x, *attrs) else: @@ -1426,7 +1428,9 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): op_type = 'fft_r2c' check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'onesided', onesided) out = getattr(_C_ops, op_type)(x, *attrs) @@ -1469,7 +1473,12 @@ def fft_c2r(x, n, axis, norm, forward, name): op_type = 'fft_c2r' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + if n is not None: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, n) + else: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, 0) + elif _in_legacy_dygraph(): if n is not None: attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'last_dim_size', n) @@ -1528,7 +1537,9 @@ def fftn_c2c(x, s, axes, norm, forward, name): op_type = 'fft_c2c' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_c2c(x, axes, norm, forward) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward) out = getattr(_C_ops, op_type)(x, *attrs) else: @@ -1579,7 +1590,9 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): op_type = 'fft_r2c' check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided) + elif _in_legacy_dygraph(): attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'onesided', onesided) out = getattr(_C_ops, op_type)(x, *attrs) @@ -1642,7 +1655,12 @@ def fftn_c2r(x, s, axes, norm, forward, name): op_type = 'fft_c2r' check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if _non_static_mode(): + if in_dygraph_mode(): + if s is not None: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, s[-1]) + else: + out = _C_ops.final_state_fft_c2r(x, axes, norm, forward, 0) + elif _in_legacy_dygraph(): if s: attrs = ('axes', axes, 'normalization', norm, 'forward', forward, 'last_dim_size', s[-1]) diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 2f155ca0edfc2..9fb14e4e72021 100644 --- 
a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -448,8 +448,8 @@ def _optimize_fp32_graph(self, graph): # Disabled due to topology-dependent speed-up graph = self._apply_pass(graph, 'fc_mkldnn_pass') graph = self._apply_pass(graph, 'fc_act_mkldnn_fuse_pass') - graph = self._apply_pass(graph, 'matmul_transpose_reshape_fuse_pass') - graph = self._apply_pass(graph, 'matmul_v2_transpose_reshape_fuse_pass') + graph = self._apply_pass(graph, + 'matmul_transpose_reshape_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'batch_norm_act_fuse_pass') graph = self._apply_pass(graph, 'softplus_activation_mkldnn_fuse_pass') # the following pass should be the last one since it will work on all fused ops. @@ -650,8 +650,6 @@ def _quantize_fp32_graph(self, graph): graph = self._apply_pass(graph, 'scale_matmul_fuse_pass') graph = self._apply_pass(graph, 'reshape_transpose_matmul_mkldnn_fuse_pass') - graph = self._apply_pass( - graph, 'reshape_transpose_matmul_v2_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'cpu_quantize_placement_pass', ['quantize_enabled_op_types'], [self._ops_to_quantize]) diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py index 36bffcc7f2ad3..1c4728b422362 100755 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/fluid/dataloader/dataset.py @@ -413,7 +413,7 @@ class Subset(Dataset): Returns: Dataset: A Dataset which is the subset of the original dataset. - Example code: + Examples: .. code-block:: python @@ -452,10 +452,10 @@ def random_split(dataset, lengths, generator=None): lengths (sequence): lengths of splits to be produced generator (Generator, optional): Generator used for the random permutation. Default is None then the DefaultGenerator is used in manual_seed(). - Returns: + Returns: Datasets: A list of subset Datasets, which are the non-overlapping subsets of the original Dataset. - Example code: + Examples: .. code-block:: python diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 4e22af9cfdb64..b30e3ff1d8527 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -485,8 +485,9 @@ def grad(outputs, inside `inputs`, and the i-th returned Tensor is the sum of gradients of `outputs` with respect to the i-th `inputs`. - Examples 1: + Examples: .. code-block:: python + :name: code-example-1 import paddle @@ -519,8 +520,8 @@ def test_dygraph_grad(create_graph): print(test_dygraph_grad(create_graph=False)) # [2.] print(test_dygraph_grad(create_graph=True)) # [4.] - Examples 2: .. code-block:: python + :name: code-example-2 import paddle diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py index 3b868ade4e29b..93670758dae77 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py @@ -143,6 +143,10 @@ def __getitem__(self, key): return self.suggestion_dict[key] +class Dy2StKeyError(Exception): + pass + + class ErrorData(object): """ Error data attached to an exception which is raised in un-transformed code. 
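`Dy2StKeyError` sidesteps a quirk of `KeyError`: `KeyError.__str__` renders its single argument with `repr()`, so a multi-line message such as the one assembled by `create_message()` would print as one quoted string with escaped newlines. The `create_exception()` hunk below substitutes this plain `Exception` subclass whenever the original error type is `KeyError`. A small sketch of the difference (the message text is made up):

    class Dy2StKeyError(Exception):   # as defined above
        pass

    msg = "In transformed code:\n  File \"model.py\", line 10, in forward\n    out = table[key]"

    try:
        raise KeyError(msg)
    except KeyError as e:
        print(str(e))    # one quoted line, the '\n' shown literally

    try:
        raise Dy2StKeyError(msg)
    except Dy2StKeyError as e:
        print(str(e))    # multi-line message printed as written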
@@ -159,7 +163,10 @@ def __init__(self, error_type, error_value, origin_traceback, def create_exception(self): message = self.create_message() - new_exception = self.error_type(message) + if self.error_type is KeyError: + new_exception = Dy2StKeyError(message) + else: + new_exception = self.error_type(message) setattr(new_exception, ERROR_DATA, self) return new_exception diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index da7cdc7f8f525..2d0d159c5eea6 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -287,6 +287,74 @@ def _verify_program(self, main_program): return main_program + def prepare_gradient_aggregation(self, main_program, target_program): + """ + Why we need add gradient aggregation operation ? + In some cases, if non leaf nodes are used as output, gradient overwriting will occur, such as + def forward(self, in): + x = 2 * in # <---- x is a non-leaf node in program. + y = x + 3 + return x, y + + loss = forward(in)[0].sum() + loss.backward() # <----- x@grad will be overwrited by elementwise_add_grad Op + """ + + def _need_aggregation(var): + """ + if exist a op whose inputs is var, then return True + """ + if not isinstance(var, framework.Variable) or var.type not in [ + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.SELECTED_ROWS + ]: + return False + if var.dtype not in [paddle.float32, paddle.float64]: + return False + for op in main_program.block(0).ops: + for in_arg in op.input_arg_names: + if in_arg == var.name: + return True + return False + + def _insert_aggregation_ops_for_var(target_program, var): + suffix = "@dy2static" + var_grad_name = var.grad_name + new_grad_name = var.name + suffix + "@GRAD" + finded_ops = list( + filter( + lambda x: any([ + out_arg == var_grad_name + for out_arg in x[1].output_arg_names + ]), enumerate(target_program.block(0).ops))) + + # len(finded_ops) may equals zero when stop_gradient works. + # len(finded_ops) may > 1, because we may have fill_constant op. + if len(finded_ops) == 0: + return None + # step1: create a new var named var.name@GRAD + target_program.block(0).create_var(name=new_grad_name, + type=var.type, + dtype=var.dtype, + shape=var.shape) + # step2: rename the var.name@GRAD to var.name@GRAD@dy2static + for idx, op in finded_ops: + op._rename_input(var_grad_name, new_grad_name) + op._rename_output(var_grad_name, new_grad_name) + # step3: insert sum op to aggregate the gradient. + # var.name@GRAD = sum(var.name@dy2static@GRAD, var.name@GRAD) + target_program.block(0)._insert_op( + finded_ops[-1][0] + 1, + type='sum', + inputs={'X': [var_grad_name, new_grad_name]}, + outputs={"Out": var_grad_name}) + return None + + to_processed_vars = list( + filter(_need_aggregation, self._outputs.tolist())) + for _var in to_processed_vars: + _insert_aggregation_ops_for_var(target_program, _var) + @switch_to_static_graph def _append_backward_desc(self, main_program): # make sure all status of is_test are False in train mode. 
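Using the `forward` example from the docstring above, the rewrite that `_insert_aggregation_ops_for_var` applies to the target program looks roughly like this (operator and variable names are illustrative):

    # before: the last op producing x@GRAD overwrites whatever gradient
    #         already flowed back through the other branch
    #     elementwise_add_grad(...) -> x@GRAD
    #
    # after:  that op is redirected to a renamed gradient, and an inserted
    #         sum op aggregates both contributions back into x@GRAD
    #     elementwise_add_grad(...) -> x@dy2static@GRAD
    #     sum(X=[x@GRAD, x@dy2static@GRAD]) -> x@GRAD

The extra `sum` op restores dygraph semantics, where gradients of a tensor consumed in several places are accumulated rather than overwritten.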
@@ -299,6 +367,8 @@ def _append_backward_desc(self, main_program): if targets and self._params: backward.gradients(targets=targets, inputs=[]) + self.prepare_gradient_aggregation(main_program, program) + return program def _prune_unused_params(self, program): diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 3a1def85c6eb9..54c2cb29d9262 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -98,6 +98,26 @@ class Layer(object): Returns: None + + Examples: + .. code-block:: python + + import paddle + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + self._dropout = paddle.nn.Dropout(p=0.5) + def forward(self, input): + temp = self._linear(input) + temp = self._dropout(temp) + return temp + x = paddle.randn([10, 1], 'float32') + mylayer = MyLayer() + mylayer.eval() # set mylayer._dropout to eval mode + out = mylayer(x) + mylayer.train() # set mylayer._dropout to train mode + out = mylayer(x) """ def __init__(self, name_scope=None, dtype="float32"): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 64e72670b8e82..f4ee554c19a32 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1187,8 +1187,9 @@ def run(self, results are spliced together in dimension 0 for the same Tensor values (Tensors in fetch_list) on different devices. - Examples 1: + Examples: .. code-block:: python + :name: code-example-1 import paddle import numpy @@ -1215,9 +1216,10 @@ def run(self, print(array_val) # [array([0.02153828], dtype=float32)] - Examples 2: .. code-block:: python + :name: code-example-2 + # required: gpu import paddle import numpy as np @@ -1265,7 +1267,7 @@ def run(self, print("The merged prediction shape: {}".format( np.array(merged_prediction).shape)) print(merged_prediction) - + # Out: # The unmerged prediction shape: (2, 3, 2) # [array([[-0.37620035, -0.19752218], @@ -1417,6 +1419,12 @@ def _can_use_interpreter_core(program, place): if program._build_strategy is not None and program._build_strategy.allow_cuda_graph_capture: return False + # Unsupported case 6: distributed + if program._build_strategy is not None and ( + program._build_strategy.is_distribution + or program._build_strategy.num_trainers > 1): + return False + # Unsupported case 6 : disabled by FLAGS_CONVERT_GRAPH_TO_PROGRAM if os.environ.get('FLAGS_CONVERT_GRAPH_TO_PROGRAM', None) not in [1, '1', True, 'True', 'true']: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4ce4801d32bd6..8db5a4353aeeb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1262,6 +1262,17 @@ def _varbase_creator(type=core.VarDesc.VarType.LOD_TENSOR, True if persistable else False) +def _all_is_type(vals, expected_type): + """ + Return True if type of each element is expected_type. + + NOTE: BuiltIn all() will always return True if vals is empty. 
+ """ + assert isinstance(vals, (list, tuple)) + if not vals: return False + return all(isinstance(v, expected_type) for v in vals) + + class VariableMetaClass(type): @classmethod @@ -2934,7 +2945,28 @@ def _to_readable_code(self, skip_op_callstack=True): if skip_op_callstack and name == "op_callstack": continue - attr_type = self.desc.attr_type(name) + attr_type = self.desc.attr_type(name, True) + if attr_type == core.AttrType.VAR: + attr_var_name = self.desc.attr(name, True).name() + a = "{name} = Var['{value}']".format(name=name, + type=attr_type, + value=attr_var_name) + attrs_str += a + if i != len(attr_names) - 1: + attrs_str += ", " + continue + + if attr_type == core.AttrType.VARS: + attr_var_names = [ + "'%s'" % var.name() for var in self.desc.attr(name, True) + ] + a = "{name} = Vars[{value}]".format( + name=name, type=attr_type, value=','.join(attr_var_names)) + attrs_str += a + if i != len(attr_names) - 1: + attrs_str += ", " + continue + if attr_type == core.AttrType.BLOCK: a = "{name} = block[{value}]".format( name=name, type=attr_type, value=self._block_attr_id(name)) @@ -3128,10 +3160,13 @@ def _update_desc_attr(self, name, val): Raises: ValueError: If the type of value doesn't match with desc.attr_type(name). """ - if isinstance(val, Block): + if isinstance(val, Variable): + self.desc.set_var_attr(name, val.desc) + elif isinstance(val, list) and _all_is_type(val, Variable): + self.desc.set_vars_attr(name, [v.desc for v in val]) + elif isinstance(val, Block): self.desc.set_block_attr(name, val.desc) - elif isinstance(val, list) and val and all( - isinstance(v, Block) for v in val): + elif isinstance(val, list) and val and _all_is_type(val, Block): self.desc.set_blocks_attr(name, [v.desc for v in val]) elif isinstance(val, core.BlockDesc) or \ isinstance(val, core.ProgramDesc): @@ -3141,7 +3176,7 @@ def _update_desc_attr(self, name, val): @property def attr_names(self): - return self.desc.attr_names() + return self.desc.attr_names(True) def attr(self, name): """ @@ -4392,10 +4427,13 @@ def _update_desc_attr(self, name, val): assert self.node.op() is not None, \ "The node operator description can not be None." desc = self.node.op() - if isinstance(val, Block): + if isinstance(val, Variable): + desc.set_var_attr(name, val.desc) + elif isinstance(val, list) and _all_is_type(val, Variable): + desc.set_vars_attr(name, [v.desc for v in val]) + elif isinstance(val, Block): desc.set_block_attr(name, val.desc) - elif isinstance(val, list) and val and \ - all(isinstance(v, Block) for v in val): + elif isinstance(val, list) and val and _all_is_type(val, Block): desc.set_blocks_attr(name, [v.desc for v in val]) elif isinstance(val, core.BlockDesc) or \ isinstance(val, core.ProgramDesc): @@ -4850,10 +4888,13 @@ def _update_desc_attr(self, desc, name, val): """ Update the value of desc's attribute by attribute's name. 
""" - if isinstance(val, Block): + if isinstance(val, Variable): + desc.set_var_attr(name, val.desc) + elif isinstance(val, list) and _all_is_type(val, Variable): + desc.set_vars_attr(name, [v.desc for v in val]) + elif isinstance(val, Block): desc.set_block_attr(name, val.desc) - elif isinstance(val, list) and val and all( - isinstance(v, Block) for v in val): + elif isinstance(val, list) and val and _all_is_type(val, Block): desc.set_blocks_attr(name, [v.desc for v in val]) elif isinstance(val, core.BlockDesc) or \ isinstance(val, core.ProgramDesc): diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 0e636d8f72dfc..b4c99a7af49c3 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -138,14 +138,18 @@ def __call__(self, var, block=None): or isinstance(var, framework.EagerParamBase)) assert isinstance(block, framework.Block) - if framework._non_static_mode(): + if in_dygraph_mode(): + place = _current_expected_place() + _C_ops.final_state_full_(var, var.shape, str(float(self._value)), + var.dtype, place) + return None + elif _in_legacy_dygraph(): _C_ops.fill_constant(var, 'value', float(self._value), 'force_cpu', self._force_cpu, 'dtype', int(var.dtype), 'str_value', str(float(self._value)), 'shape', var.shape) return None else: - # fill constant should set the "str_value" to preserve precision op = block.append_op(type="fill_constant", outputs={"Out": var}, attrs={ @@ -1187,7 +1191,7 @@ def calculate_gain(nonlinearity, param=None): Examples: .. code-block:: python - :name: code-example1 + import paddle gain = paddle.nn.initializer.calculate_gain('tanh') # 5.0 / 3 gain = paddle.nn.initializer.calculate_gain('leaky_relu', param=1.0) # 1.0 = math.sqrt(2.0 / (1+param^2)) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index db88331040fa7..0f69949018de7 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -184,8 +184,6 @@ def is_belong_to_optimizer(var): @dygraph_not_support def get_program_parameter(program): """ - :api_attr: Static Graph - Get all the parameters from Program. Args: @@ -212,8 +210,6 @@ def get_program_parameter(program): @dygraph_not_support def get_program_persistable_vars(program): """ - :api_attr: Static Graph - Get all the persistable vars from Program. Args: @@ -292,9 +288,7 @@ def save_vars(executor, predicate=None, filename=None): """ - :api_attr: Static Graph - - This API saves specific variables in the `Program` to files. + Save specific variables in the `Program` to files. There are two ways to specify the variables to be saved: set variables in a list and assign it to the `vars`, or use the `predicate` function to select @@ -436,9 +430,7 @@ def name_has_fc(var): @dygraph_not_support def save_params(executor, dirname, main_program=None, filename=None): """ - :api_attr: Static Graph - - This operator saves all parameters from the :code:`main_program` to + Save all parameters from the :code:`main_program` to the folder :code:`dirname` or file :code:`filename`. You can refer to :ref:`api_guide_model_save_reader_en` for more details. @@ -670,9 +662,7 @@ def is_valid(var): @dygraph_not_support def save_persistables(executor, dirname, main_program=None, filename=None): """ - :api_attr: Static Graph - - This operator saves all persistable variables from :code:`main_program` to + Save all persistable variables from :code:`main_program` to the folder :code:`dirname` or file :code:`filename`. 
You can refer to :ref:`api_guide_model_save_reader_en` for more details. And then saves these persistables variables to the folder :code:`dirname` or file @@ -780,9 +770,6 @@ def load_vars(executor, Returns: None - Raises: - TypeError: If `main_program` is not an instance of Program nor None. - Examples: .. code-block:: python @@ -1247,8 +1234,6 @@ def save_inference_model(dirname, program_only=False, clip_extra=False): """ - :api_attr: Static Graph - Prune the given `main_program` to build a new program especially for inference, and then save it and all related parameters to given `dirname` . If you just want to save parameters of your trained model, please use the @@ -1279,7 +1264,7 @@ def save_inference_model(dirname, params_filename(str, optional): The name of file to save all related parameters. If it is set None, parameters will be saved in separate files . - export_for_deployment(bool): If True, programs are modified to only support + export_for_deployment(bool, optional): If True, programs are modified to only support direct inference deployment. Otherwise, more information will be stored for flexible optimization and re-training. Currently, only @@ -1290,14 +1275,7 @@ def save_inference_model(dirname, Default: False. Returns: - The fetch variables' name list - - Return Type: - list - - Raises: - ValueError: If `feed_var_names` is not a list of basestring, an exception is thrown. - ValueError: If `target_vars` is not a list of Variable, an exception is thrown. + list, The fetch variables' name list. Examples: .. code-block:: python @@ -1462,8 +1440,6 @@ def load_inference_model(dirname, params_filename=None, pserver_endpoints=None): """ - :api_attr: Static Graph - Load the inference model from a given directory. By this API, you can get the model structure(Inference Program) and model parameters. If you just want to load parameters of the pre-trained model, please use the :ref:`api_fluid_io_load_params` API. @@ -1501,8 +1477,6 @@ def load_inference_model(dirname, ``Variable`` (refer to :ref:`api_guide_Program_en`). It contains variables from which we can get inference results. - Raises: - ValueError: If `dirname` is not a existing directory. Examples: .. code-block:: python @@ -1659,12 +1633,6 @@ def get_parameter_value_by_name(name, executor, program=None): Returns: numpy.array: The parameter's values. - Raises: - TypeError: If given `name` is not an instance of basestring. - TypeError: If the parameter with the given name doesn't exist. - AssertionError: If there is a variable named `name` in the - given program but it is not a Parameter. - Examples: .. code-block:: python @@ -2314,8 +2282,6 @@ def _load_vars_with_try_catch(exe, @static_only def set_program_state(program, state_dict): """ - :api_attr: Static Graph - Set program parameter from state_dict An exception will throw if shape or dtype of the parameters is not match. diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 0ee2338ac1ae1..61ab466c24362 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -311,7 +311,7 @@ def cross_entropy2(input, label, ignore_index=kIgnoreIndex): def square_error_cost(input, label): r""" - This op accepts input predictions and target label and returns the + Accept input predictions and target label and returns the squared error cost. For predictions label, and target label, the equation is: @@ -325,10 +325,8 @@ def square_error_cost(input, label): label (Tensor): Label tensor, the data type should be float32. 
Returns: - The tensor storing the element-wise squared error \ - difference between input and label. - - Return type: Tensor. + Tensor, The tensor storing the element-wise squared + error difference between input and label. Examples: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 03d21035bd510..3ce7c453613b5 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1132,8 +1132,11 @@ def dropout(x, x = fluid.data(name="data", shape=[None, 32, 32], dtype="float32") dropped = fluid.layers.dropout(x, dropout_prob=0.5) """ + if not isinstance(dropout_prob, (float, int, Variable)): + raise TypeError( + "dropout_prob argument should be a number(int|float) or Variable") # fast return for p == 0 - if dropout_prob == 0: + if isinstance(dropout_prob, (int, float)) and dropout_prob == 0: return x if _non_static_mode(): @@ -1152,6 +1155,10 @@ def dropout(x, def get_attrs(prog, dropout_prob, is_test, seed): if (seed is None or seed == 0) and prog.random_seed != 0: seed = prog.random_seed + if isinstance(dropout_prob, Variable) and not dropout_prob.shape != [1]: + raise TypeError( + "Required dropout_prob.shape == [1] if type(dropout_prob) is Variable, but received dropout_prob.shape = {}" + .format(dropout_prob.shape)) attrs = { 'dropout_prob': dropout_prob, 'is_test': is_test, @@ -9359,14 +9366,14 @@ def crop(x, shape=None, offsets=None, name=None): Parameters: x (Variable): Tensor, data type can be float32 or float64. - shape (Variable|list/tuple of integers): The output shape is specified + shape (Variable|list/tuple of integers, optional): The output shape is specified by `shape`, which can be a Tensor or a list/tuple of integers. If it is a Tensor, it's rank must be the same as `x` , only it's shape will be used, and the value of it will be ignored. This way is suitable for the case that the output shape may be changed each iteration. If it is a list/tuple of integers, it's length must be the same as the rank of `x` - offsets (Variable|list/tuple of integers|None): Specifies the cropping + offsets (Variable|list/tuple of integers|None, optional): Specifies the cropping offsets at each dimension. It can be a Tensor or a list/tuple of integers. If it is a Tensor, it's rank must be the same as `x`. This way is suitable for the case that the offsets may be changed @@ -9377,13 +9384,7 @@ def crop(x, shape=None, offsets=None, name=None): None by default. Returns: - The cropped Tensor, which has the same rank and data type with `x` - - Return Type: - Variable - - Raises: - ValueError: If shape is not a list, tuple or Variable. + Tensor, The cropped Tensor, which has the same rank and data type with `x`. Examples: @@ -9721,7 +9722,8 @@ def pad2d(input, name (str, optional) : The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - Returns: Tensor, a 4-D Tensor padded according to paddings and mode and data type is same as input. + Returns: + Tensor, a 4-D Tensor padded according to paddings and mode and data type is same as input. Examples: .. code-block:: text @@ -13282,14 +13284,10 @@ def space_to_depth(x, blocksize, name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. 
- Returns: The output, which should be 4 dims Tensor or LodTensor, with the shape \ + Returns: + Tensor, The output, which should be 4 dims Tensor or LodTensor, with the shape \ [batch, channel * blocksize * blocksize, height/blocksize, width/blocksize] - Return Type: Variable - - Raises: - TypeError: blocksize type must be int64. - Examples: .. code-block:: python diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index 4a213a7a146c8..6d8b0992ff014 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -54,9 +54,9 @@ def sequence_conv(input, act=None, name=None): r""" - :api_attr: Static Graph - **Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use conv2d Op.(fluid.layers.** :ref:`api_fluid_layers_conv2d` ). + Note: + Only receives LoDTensor as input. If your input is Tensor, please use conv2d Op.(fluid.layers.** :ref:`api_fluid_layers_conv2d` ). This operator receives input sequences with variable length and other convolutional configuration parameters(num_filters, filter_size) to apply the convolution operation. @@ -179,11 +179,9 @@ def sequence_conv(input, def sequence_softmax(input, use_cudnn=False, name=None): r""" - :api_attr: Static Graph - **Note**: - - **The input type of the OP must be LoDTensor. For Tensor, use:** :ref:`api_fluid_layers_softmax` + Note: + The input type of the OP must be LoDTensor. For Tensor, use:** :ref:`api_fluid_layers_softmax` A LoD-tensor can be regarded as several sequences, and this op apply softmax algo on each sequence. The shape of input Tensor can be :math:`[N, 1]` or :math:`[N]`, where :math:`N` @@ -264,9 +262,9 @@ def sequence_softmax(input, use_cudnn=False, name=None): def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): r""" - :api_attr: Static Graph - **Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use pool2d Op.(fluid.layers.** :ref:`api_fluid_layers_pool2d` ). + Note: + Only receives LoDTensor as input. If your input is Tensor, please use pool2d Op.(fluid.layers.** :ref:`api_fluid_layers_pool2d` ). This operator only supports LoDTensor as input. It will apply specified pooling operation on the input LoDTensor. It pools features of all time-steps of each @@ -381,9 +379,9 @@ def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): @templatedoc() def sequence_concat(input, name=None): """ - :api_attr: Static Graph - **Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use concat Op.(fluid.layers.** :ref:`api_fluid_layers_concat` ). + Note: + Only receives LoDTensor as input. If your input is Tensor, please use concat Op.(fluid.layers.** :ref:`api_fluid_layers_concat` ). This operator only supports LoDTensor as input. It concatenates the multiple LoDTensor from input by the LoD information, and outputs the concatenated LoDTensor. @@ -445,9 +443,8 @@ def sequence_concat(input, name=None): def sequence_first_step(input): """ - :api_attr: Static Graph - This operator only supports LoDTensor as input. Given the input LoDTensor, it will + Only supports LoDTensor as input. 
Given the input LoDTensor, it will select first time-step feature of each sequence as output. .. code-block:: text @@ -503,9 +500,8 @@ def sequence_first_step(input): def sequence_last_step(input): """ - :api_attr: Static Graph - This operator only supports LoDTensor as input. Given the input LoDTensor, it will + Only supports LoDTensor as input. Given the input LoDTensor, it will select last time-step feature of each sequence as output. .. code-block:: text @@ -562,7 +558,6 @@ def sequence_last_step(input): def sequence_slice(input, offset, length, name=None): """ - :api_attr: Static Graph **Sequence Slice Layer** @@ -653,7 +648,6 @@ def sequence_slice(input, offset, length, name=None): def sequence_expand(x, y, ref_level=-1, name=None): r""" - :api_attr: Static Graph Sequence Expand Layer. This layer will expand the input variable ``x`` \ according to specified level ``ref_level`` lod of ``y``. Please note that \ @@ -662,11 +656,13 @@ def sequence_expand(x, y, ref_level=-1, name=None): of ``y``. If the lod level of ``x`` is 0, then the first dim of ``x`` should \ be equal to the size of ``ref_level`` of ``y``. The rank of **x** is at least 2. \ When rank of ``x`` is greater than 2, then it would be viewed as a 2-D tensor. + + Note: - Please note that the input ``x`` should be LodTensor or Tensor, \ + Please note that the input ``x`` should be LodTensor or Tensor, \ and input ``y`` must be LodTensor. - Following examples will explain how sequence_expand works: + **Following examples will explain how sequence_expand works:** .. code-block:: text @@ -722,12 +718,11 @@ def sequence_expand(x, y, ref_level=-1, name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: The expanded variable which is a LoDTensor, with dims ``[N, K]``. \ + Returns: + Tensor, The expanded variable which is a LoDTensor, with dims ``[N, K]``. \ ``N`` depends on the lod info of ``x`` and ``y``. \ The data type is same as input. - Return Type: Variable - Examples: .. code-block:: python @@ -791,7 +786,6 @@ def sequence_expand(x, y, ref_level=-1, name=None): def sequence_expand_as(x, y, name=None): r""" - :api_attr: Static Graph Sequence Expand As Layer. This OP will expand the input variable ``x`` \ according to the zeroth level lod of ``y``. Current implementation requires \ @@ -800,7 +794,8 @@ def sequence_expand_as(x, y, name=None): the expanded LodTensor has the same lod info as ``y``. The expanded result \ has nothing to do with ``x``'s lod, so the lod of Input(X) is not considered. - Please note that the input ``x`` should be LodTensor or Tensor, \ + Note: + Please note that the input ``x`` should be LodTensor or Tensor, \ and input ``y`` must be LodTensor. Following examples will explain how sequence_expand_as works: @@ -845,12 +840,11 @@ def sequence_expand_as(x, y, name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: The expanded variable which is a LoDTensor with the dims ``[N, K]``. \ + Returns: + Tensor, The expanded variable which is a LoDTensor with the dims ``[N, K]``. \ ``N`` depends on the lod of ``y``, and the lod level must be 1. \ The data type is same as input. - Return Type: Variable - Examples: .. code-block:: python @@ -913,7 +907,6 @@ def sequence_expand_as(x, y, name=None): def sequence_pad(x, pad_value, maxlen=None, name=None): r""" - :api_attr: Static Graph This layer padding the sequences in a same batch to a common length (according to ``maxlen``). 
The padding value is defined by ``pad_value``, and will be @@ -921,6 +914,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): the LodTensor ``Out`` is the padded sequences, and LodTensor ``Length`` is the length information of input sequences. For removing padding data (unpadding operation), See :ref:`api_fluid_layers_sequence_unpad`. + Note: Please note that the input ``x`` should be LodTensor. .. code-block:: text @@ -978,13 +972,12 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: A Python tuple (Out, Length): the 1st is a 0 level LodTensor \ + Returns: + tuple, A Python tuple (Out, Length): the 1st is a 0 level LodTensor \ ``Out``, with the shape ``[batch_size, maxlen, K]``; the second is the original \ sequences length infor ``Length``, which should be a 0-level 1D LodTensor. \ The size of ``Length`` is equal to batch size, and the data type is int64. - Return Type: tuple - Examples: .. code-block:: python @@ -1031,13 +1024,11 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): def sequence_unpad(x, length, name=None): """ - :api_attr: Static Graph - **Note**: - - **The input of the OP is Tensor and the output is LoDTensor. For padding operation, See:** :ref:`api_fluid_layers_sequence_pad` + Note: + The input of the OP is Tensor and the output is LoDTensor. For padding operation, See:** :ref:`api_fluid_layers_sequence_pad` - The OP removes the padding data from the input based on the length information and returns a LoDTensor. + Remove the padding data from the input based on the length information and returns a LoDTensor. .. code-block:: text @@ -1109,11 +1100,11 @@ def sequence_unpad(x, length, name=None): def sequence_reshape(input, new_dim): """ - :api_attr: Static Graph - **Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use reshape Op.(fluid.layers.** :ref:`api_fluid_layers_reshape` ). + Note: + Only receives LoDTensor as input. If your input is Tensor, please use reshape Op.(fluid.layers.** :ref:`api_fluid_layers_reshape` ). - This operator only supports LoDTensor as input. Given :attr:`new_dim` , + Only supports LoDTensor as input. Given :attr:`new_dim` , it will compute new shape according to original length of each sequence, original dimensions and :attr:`new_dim` . Then it will output a new LoDTensor containing :attr:`new_dim` . Currently it only supports 1-level LoDTensor. @@ -1172,11 +1163,9 @@ def sequence_reshape(input, new_dim): def sequence_scatter(input, index, updates, name=None): """ - :api_attr: Static Graph - **Note**: - - **The index and updates parameters of the OP must be LoDTensor.** + Note: + The index and updates parameters of the OP must be LoDTensor. Plus the updates data to the corresponding input according to the index. @@ -1264,7 +1253,6 @@ def sequence_scatter(input, index, updates, name=None): def sequence_enumerate(input, win_size, pad_value=0, name=None): r""" - :api_attr: Static Graph Generate a new sequence for the input index sequence with \ shape ``[d_1, win_size]``, which enumerates all the \ @@ -1300,12 +1288,11 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: The enumerate sequence variable which is a LoDTensor with \ + Returns: + Tensor, The enumerate sequence variable which is a LoDTensor with \ shape ``[d_1, win_size]`` and 1-level lod info. \ The data type is same as ``input``. 
- Return Type: Variable - Examples: .. code-block:: python @@ -1371,12 +1358,11 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: The output sequence mask. Tensor with shape [d_1, d_2, ..., d_n, maxlen] \ - and data type of :code:`dtype`. The data type should be bool, float32, float64, int8, \ + Returns: + Tensor, The output sequence mask. Tensor with shape [d_1, d_2, ..., d_n, maxlen] + and data type of :code:`dtype`. The data type should be bool, float32, float64, int8, int32 or int64. - Return Type: Tensor - Examples: .. code-block:: python @@ -1398,9 +1384,10 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): @templatedoc() def sequence_reverse(x, name=None): """ - **Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use reverse Op.(fluid.layers.** :ref:`api_fluid_layers_reverse` ). + Note: + Only receives LoDTensor as input. If your input is Tensor, please use reverse Op.(fluid.layers.** :ref:`api_fluid_layers_reverse` ). - This operator only supports LoDTensor as input. It will reverse each sequence for input LoDTensor. + Only supports LoDTensor as input. It will reverse each sequence for input LoDTensor. Currently it only supports 1-level LoDTensor. This operator is very useful when building a reverse :ref:`api_fluid_layers_DynamicRNN` network. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 6862ac1fd6634..dbbc207fba401 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -390,9 +390,7 @@ def concat(input, axis=0, name=None): attrs = {} if isinstance(axis, Variable): axis.stop_gradient = True - inputs['AxisTensor'] = axis - else: - attrs['axis'] = axis + attrs['axis'] = axis helper.append_op(type='concat', inputs=inputs, diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 9ee27e0c3cfe9..4bc6be7e09387 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -172,7 +172,7 @@ def update(self, preds, labels): None Return types: - None + None """ raise NotImplementedError( diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py index 130f74c06d554..55b3fad1f3519 100644 --- a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py +++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py @@ -43,9 +43,10 @@ def test_custom_kernel_dot_run(self): y = paddle.to_tensor(y_data) out = paddle.dot(x, y) - self.assertTrue( - np.array_equal(out.numpy(), result), - "custom kernel dot out: {},\n numpy dot out: {}".format( + np.testing.assert_array_equal( + out.numpy(), + result, + err_msg='custom kernel dot out: {},\n numpy dot out: {}'.format( out.numpy(), result)) @@ -72,9 +73,10 @@ def test_custom_kernel_dot_run(self): y = paddle.to_tensor(y_data) out = paddle.dot(x, y) - self.assertTrue( - np.array_equal(out.numpy(), result), - "custom kernel dot out: {},\n numpy dot out: {}".format( + np.testing.assert_array_equal( + out.numpy(), + result, + err_msg='custom kernel dot out: {},\n numpy dot out: {}'.format( out.numpy(), result)) diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py index a4def8df9e08c..4ca05909fb17a 100644 --- a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py +++ 
b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py @@ -65,9 +65,10 @@ def test_custom_kernel_dot_load(self): y = paddle.to_tensor(y_data) out = paddle.dot(x, y) - self.assertTrue( - np.array_equal(out.numpy(), result), - "custom kernel dot out: {},\n numpy dot out: {}".format( + np.testing.assert_array_equal( + out.numpy(), + result, + err_msg='custom kernel dot out: {},\n numpy dot out: {}'.format( out.numpy(), result)) def tearDown(self): diff --git a/python/paddle/fluid/tests/custom_op/test_context_pool.py b/python/paddle/fluid/tests/custom_op/test_context_pool.py index d4a079ee4fe10..69b8b18559ef9 100644 --- a/python/paddle/fluid/tests/custom_op/test_context_pool.py +++ b/python/paddle/fluid/tests/custom_op/test_context_pool.py @@ -51,7 +51,7 @@ def use_context_pool(self): x = paddle.ones([2, 2], dtype='float32') out = custom_ops.context_pool_test(x) - self.assertTrue(np.array_equal(x.numpy(), out.numpy())) + np.testing.assert_array_equal(x.numpy(), out.numpy()) def test_using_context_pool(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py index 953ca5519060f..a0be75c0a41eb 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py @@ -66,7 +66,7 @@ def func_attr_value(self): out.stop_gradient = False out.backward() - self.assertTrue(np.array_equal(x.numpy(), out.numpy())) + np.testing.assert_array_equal(x.numpy(), out.numpy()) def test_attr_value(self): with _test_eager_guard(): @@ -85,7 +85,7 @@ def func_const_attr_value(self): out.stop_gradient = False out.backward() - self.assertTrue(np.array_equal(x.numpy(), out.numpy())) + np.testing.assert_array_equal(x.numpy(), out.numpy()) def test_const_attr_value(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_concat.py b/python/paddle/fluid/tests/custom_op/test_custom_concat.py index 83be96a95a85d..ae3022411b18e 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_concat.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_concat.py @@ -112,9 +112,10 @@ def setUp(self): self.axises = [0, 1] def check_output(self, out, pd_out, name): - self.assertTrue( - np.array_equal(out, pd_out), - "custom op {}: {},\n paddle api {}: {}".format( + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op {}: {},\n paddle api {}: {}'.format( name, out, name, pd_out)) def func_dynamic(self): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_conj.py b/python/paddle/fluid/tests/custom_op/test_custom_conj.py index ea916ff55ecab..a389a72df7350 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_conj.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_conj.py @@ -97,9 +97,10 @@ def setUp(self): self.shape = [2, 20, 2, 3] def check_output(self, out, pd_out, name): - self.assertTrue( - np.array_equal(out, pd_out), - "custom op {}: {},\n paddle api {}: {}".format( + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op {}: {},\n paddle api {}: {}'.format( name, out, name, pd_out)) def run_dynamic(self, dtype, np_input): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_linear.py b/python/paddle/fluid/tests/custom_op/test_custom_linear.py index 2309751659afe..3ae650ee9494d 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_linear.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_linear.py @@ -97,9 +97,10 @@ def 
setUp(self): self.np_bias = np.ones([4], dtype="float32") def check_output(self, out, pd_out, name): - self.assertTrue( - np.array_equal(out, pd_out), - "custom op {}: {},\n paddle api {}: {}".format( + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op {}: {},\n paddle api {}: {}'.format( name, out, name, pd_out)) def test_static(self): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py index f95f57b4b7a99..3cd550c95f00e 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py @@ -83,7 +83,7 @@ def test_static(self): y1_value, y2_value = exe.run(paddle.static.default_main_program(), feed={x.name: x_np}, fetch_list=[y1, y2]) - self.assertTrue(np.array_equal(y1_value, y2_value)) + np.testing.assert_array_equal(y1_value, y2_value) paddle.disable_static() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py index ff0b11128a4f0..18f3252e6e145 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py @@ -121,12 +121,11 @@ def func_train_eval(self): if _in_legacy_dygraph(): custom_relu_dy2stat_train_out = self.train_model( use_custom_op=True, dy2stat=True) # for to_static - self.assertTrue( - np.array_equal(origin_relu_train_out, - custom_relu_dy2stat_train_out)) + np.testing.assert_array_equal(origin_relu_train_out, + custom_relu_dy2stat_train_out) - self.assertTrue( - np.array_equal(origin_relu_train_out, custom_relu_train_out)) + np.testing.assert_array_equal(origin_relu_train_out, + custom_relu_train_out) # for eval origin_relu_eval_out = self.eval_model(use_custom_op=False) @@ -134,12 +133,11 @@ def func_train_eval(self): if _in_legacy_dygraph(): custom_relu_dy2stat_eval_out = self.eval_model( use_custom_op=True, dy2stat=True) # for to_static - self.assertTrue( - np.array_equal(origin_relu_eval_out, - custom_relu_dy2stat_eval_out)) + np.testing.assert_array_equal(origin_relu_eval_out, + custom_relu_dy2stat_eval_out) - self.assertTrue( - np.array_equal(origin_relu_eval_out, custom_relu_eval_out)) + np.testing.assert_array_equal(origin_relu_eval_out, + custom_relu_eval_out) def test_train_eval(self): with _test_eager_guard(): @@ -243,11 +241,10 @@ def test_train_eval(self): use_custom_op=True, use_pe=True) - self.assertTrue( - np.array_equal(original_relu_train_out, custom_relu_train_out)) - self.assertTrue( - np.array_equal(original_relu_train_pe_out, - custom_relu_train_pe_out)) + np.testing.assert_array_equal(original_relu_train_out, + custom_relu_train_out) + np.testing.assert_array_equal(original_relu_train_pe_out, + custom_relu_train_pe_out) # for eval original_relu_eval_out = self.eval_model(device, @@ -261,11 +258,10 @@ def test_train_eval(self): use_custom_op=True, use_pe=True) - self.assertTrue( - np.array_equal(original_relu_eval_out, custom_relu_eval_out)) - self.assertTrue( - np.array_equal(original_relu_eval_pe_out, - custom_relu_eval_pe_out)) + np.testing.assert_array_equal(original_relu_eval_out, + custom_relu_eval_out) + np.testing.assert_array_equal(original_relu_eval_pe_out, + custom_relu_eval_pe_out) def train_model(self, device, use_custom_op=False, use_pe=False): # reset random seed diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py 
b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 5052a0989bb89..f01a737a3b1c0 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -71,10 +71,11 @@ def test_static(self): out = custom_relu_static(custom_op, device, dtype, x) pd_out = custom_relu_static(custom_op, device, dtype, x, False) - self.assertTrue( - np.array_equal(out, pd_out), - "custom op out: {},\n paddle api out: {}".format( - out, pd_out)) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'. + format(out, pd_out)) def func_dynamic(self): for device in self.devices: @@ -87,14 +88,16 @@ def func_dynamic(self): x) pd_out, pd_x_grad = custom_relu_dynamic( custom_op, device, dtype, x, False) - self.assertTrue( - np.array_equal(out, pd_out), - "custom op out: {},\n paddle api out: {}".format( - out, pd_out)) - self.assertTrue( - np.array_equal(x_grad, pd_x_grad), - "custom op x grad: {},\n paddle api x grad: {}".format( - x_grad, pd_x_grad)) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'. + format(out, pd_out)) + np.testing.assert_array_equal( + x_grad, + pd_x_grad, + err_msg='custom op x grad: {},\n paddle api x grad: {}'. + format(x_grad, pd_x_grad)) def test_dynamic(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 1a53bf3354f36..0cc1b19e654bf 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -224,10 +224,11 @@ def test_static(self): out = custom_relu_static(custom_op, device, dtype, x) pd_out = custom_relu_static(custom_op, device, dtype, x, False) - self.assertTrue( - np.array_equal(out, pd_out), - "custom op out: {},\n paddle api out: {}".format( - out, pd_out)) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'. + format(out, pd_out)) def test_static_pe(self): for device in self.devices: @@ -239,10 +240,11 @@ def test_static_pe(self): out = custom_relu_static_pe(custom_op, device, dtype, x) pd_out = custom_relu_static_pe(custom_op, device, dtype, x, False) - self.assertTrue( - np.array_equal(out, pd_out), - "custom op out: {},\n paddle api out: {}".format( - out, pd_out)) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'. + format(out, pd_out)) def func_dynamic(self): for device in self.devices: @@ -255,14 +257,16 @@ def func_dynamic(self): x) pd_out, pd_x_grad = custom_relu_dynamic( custom_op, device, dtype, x, False) - self.assertTrue( - np.array_equal(out, pd_out), - "custom op out: {},\n paddle api out: {}".format( - out, pd_out)) - self.assertTrue( - np.array_equal(x_grad, pd_x_grad), - "custom op x grad: {},\n paddle api x grad: {}".format( - x_grad, pd_x_grad)) + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'. + format(out, pd_out)) + np.testing.assert_array_equal( + x_grad, + pd_x_grad, + err_msg='custom op x grad: {},\n paddle api x grad: {}'. 
+ format(x_grad, pd_x_grad)) def test_dynamic(self): with _test_eager_guard(): @@ -286,10 +290,11 @@ def test_static_save_and_load_inference_model(self): predict_infer = exe.run(inference_program, feed={feed_target_names[0]: np_data}, fetch_list=fetch_targets) - self.assertTrue( - np.array_equal(predict, predict_infer), - "custom op predict: {},\n custom op infer predict: {}". - format(predict, predict_infer)) + np.testing.assert_array_equal( + predict, + predict_infer, + err_msg='custom op predict: {},\n custom op infer predict: {}' + .format(predict, predict_infer)) paddle.disable_static() def test_static_save_and_run_inference_predictor(self): @@ -331,14 +336,16 @@ def test_func_double_grad_dynamic(self): self.custom_ops[0], device, dtype, x) pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( self.custom_ops[0], device, dtype, x, False) - self.assertTrue( - np.array_equal(out, pd_out), - "custom op out: {},\n paddle api out: {}".format( + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( out, pd_out)) - self.assertTrue( - np.array_equal(dx_grad, pd_dx_grad), - "custom op dx grad: {},\n paddle api dx grad: {}".format( - dx_grad, pd_dx_grad)) + np.testing.assert_array_equal( + dx_grad, + pd_dx_grad, + err_msg='custom op dx grad: {},\n paddle api dx grad: {}'. + format(dx_grad, pd_dx_grad)) def test_with_dataloader(self): for device in self.devices: @@ -357,9 +364,10 @@ def test_with_dataloader(self): for batch_id, (image, _) in enumerate(train_loader()): out = self.custom_ops[0](image) pd_out = paddle.nn.functional.relu(image) - self.assertTrue( - np.array_equal(out, pd_out), - "custom op out: {},\n paddle api out: {}".format( + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op out: {},\n paddle api out: {}'.format( out, pd_out)) if batch_id == 5: diff --git a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py index 4202545759cfd..e7d60fd4296eb 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py @@ -46,9 +46,11 @@ def func_slice_output(self): x = paddle.to_tensor(np_x) custom_op_out = custom_ops.custom_simple_slice(x, 2, 3) np_out = np_x[2:3] - self.assertTrue( - np.array_equal(custom_op_out, np_out), - "custom op: {},\n numpy: {}".format(np_out, custom_op_out.numpy())) + np.testing.assert_array_equal( + custom_op_out, + np_out, + err_msg='custom op: {},\n numpy: {}'.format(np_out, + custom_op_out.numpy())) def test_slice_output(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py index d48d25ea3b1c1..ff5192db7aafc 100644 --- a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py @@ -47,9 +47,10 @@ def run_dispatch_test_impl(self, func, dtype): np_x = x.numpy() np_out = out.numpy() self.assertTrue(dtype in str(np_out.dtype)) - self.assertTrue( - np.array_equal(np_x, np_out), - "custom op x: {},\n custom op out: {}".format(np_x, np_out)) + np.testing.assert_array_equal( + np_x, + np_out, + err_msg='custom op x: {},\n custom op out: {}'.format(np_x, np_out)) def run_dispatch_test(self, func, dtype): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py index 
83731de32a4f0..0a0a2e8e6e3fe 100644 --- a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py @@ -70,14 +70,12 @@ def check_multi_outputs(self, outs, is_dynamic=False): one_int32 = one_int32.numpy() # Fake_float64 self.assertTrue('float64' in str(zero_float64.dtype)) - self.assertTrue( - np.array_equal(zero_float64, - np.zeros([4, 8]).astype('float64'))) + np.testing.assert_array_equal(zero_float64, + np.zeros([4, 8]).astype('float64')) # ZFake_int32 self.assertTrue('int32' in str(one_int32.dtype)) - self.assertTrue( - np.array_equal(one_int32, - np.ones([4, 8]).astype('int32'))) + np.testing.assert_array_equal(one_int32, + np.ones([4, 8]).astype('int32')) def test_static(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt index 8056f46895892..b825805bdf986 100644 --- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt @@ -1,4 +1,4 @@ -if(WITH_CUSTOM_DEVICE) +if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) file( GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py index dcdb7d2d1206e..bf1effe21917f 100644 --- a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py @@ -144,21 +144,21 @@ def _test_eager_copy_to(self): place=paddle.CPUPlace()) custom_cpu_tensor = cpu_tensor._copy_to( paddle.CustomPlace('custom_cpu', 0), True) - self.assertTrue(np.array_equal(custom_cpu_tensor, x)) + np.testing.assert_array_equal(custom_cpu_tensor, x) self.assertTrue(custom_cpu_tensor.place.is_custom_place()) # custom -> custom another_custom_cpu_tensor = custom_cpu_tensor._copy_to( paddle.CustomPlace('custom_cpu', 0), True) - self.assertTrue(np.array_equal(another_custom_cpu_tensor, x)) + np.testing.assert_array_equal(another_custom_cpu_tensor, x) self.assertTrue(another_custom_cpu_tensor.place.is_custom_place()) # custom -> cpu another_cpu_tensor = custom_cpu_tensor._copy_to(paddle.CPUPlace(), True) - self.assertTrue(np.array_equal(another_cpu_tensor, x)) + np.testing.assert_array_equal(another_cpu_tensor, x) self.assertTrue(another_cpu_tensor.place.is_cpu_place()) # custom -> custom self another_custom_cpu_tensor = another_custom_cpu_tensor._copy_to( paddle.CustomPlace('custom_cpu', 0), True) - self.assertTrue(np.array_equal(another_custom_cpu_tensor, x)) + np.testing.assert_array_equal(another_custom_cpu_tensor, x) self.assertTrue(another_custom_cpu_tensor.place.is_custom_place()) def _test_fallback_kernel(self): @@ -168,7 +168,7 @@ def _test_fallback_kernel(self): x = paddle.to_tensor([5, 4, 3], 'int16') y = paddle.to_tensor([1, 2, 3], 'int16') z = paddle.add(x, y) - self.assertTrue(np.array_equal(z, r)) + np.testing.assert_array_equal(z, r) def tearDown(self): del os.environ['CUSTOM_DEVICE_ROOT'] diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 046aa4c1f1726..420f3d40affcd 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -596,7 +596,7 @@ def test_generate_proposals(self): 'var': variances_np }, fetch_list=[rois, roi_probs, rois_num], - with_lod=True) + with_lod=False) with self.dynamic_graph(): scores_dy = 
base.to_variable(scores_np) @@ -617,9 +617,9 @@ def test_generate_proposals(self): roi_probs_dy = roi_probs.numpy() rois_num_dy = rois_num.numpy() - self.assertTrue(np.array_equal(np.array(rois_stat), rois_dy)) - self.assertTrue(np.array_equal(np.array(roi_probs_stat), roi_probs_dy)) - self.assertTrue(np.array_equal(np.array(rois_num_stat), rois_num_dy)) + np.testing.assert_array_equal(np.array(rois_stat), rois_dy) + np.testing.assert_array_equal(np.array(roi_probs_stat), roi_probs_dy) + np.testing.assert_array_equal(np.array(rois_num_stat), rois_num_dy) class TestYoloDetection(unittest.TestCase): @@ -837,8 +837,8 @@ def test_collect_fpn_proposals(self): fpn_rois_dy = fpn_rois_dy.numpy() rois_num_dy = rois_num_dy.numpy() - self.assertTrue(np.array_equal(fpn_rois_stat, fpn_rois_dy)) - self.assertTrue(np.array_equal(rois_num_stat, rois_num_dy)) + np.testing.assert_array_equal(fpn_rois_stat, fpn_rois_dy) + np.testing.assert_array_equal(rois_num_stat, rois_num_dy) def test_collect_fpn_proposals_error(self): @@ -932,7 +932,7 @@ def test_distribute_fpn_proposals(self): output_dy_np.append(output_np) for res_stat, res_dy in zip(output_stat_np, output_dy_np): - self.assertTrue(np.array_equal(res_stat, res_dy)) + np.testing.assert_array_equal(res_stat, res_dy) def test_distribute_fpn_proposals_error(self): program = Program() diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py index cc97b0eb5aea4..eac7feb7775f0 100644 --- a/python/paddle/fluid/tests/test_lod_tensor.py +++ b/python/paddle/fluid/tests/test_lod_tensor.py @@ -71,11 +71,9 @@ def test_create_lod_tensor(self): correct_recursive_seq_lens) self.assertEqual(tensor._dtype(), core.VarDesc.VarType.INT64) self.assertEqual(tensor.shape(), [5, 1]) - self.assertTrue( - np.array_equal( - np.array(tensor), - np.array([1, 2, 3, 3, - 4]).reshape(tensor.shape()).astype('int64'))) + np.testing.assert_array_equal( + np.array(tensor), + np.array([1, 2, 3, 3, 4]).reshape(tensor.shape()).astype('int64')) # Create LoDTensor from numpy array data = np.random.random([10, 1]).astype('float64') @@ -85,7 +83,7 @@ def test_create_lod_tensor(self): recursive_seq_lens) self.assertEqual(tensor._dtype(), core.VarDesc.VarType.FP64) self.assertEqual(tensor.shape(), [10, 1]) - self.assertTrue(np.array_equal(np.array(tensor), data)) + np.testing.assert_array_equal(np.array(tensor), data) # Create LoDTensor from another LoDTensor, they are differnt instances new_recursive_seq_lens = [[2, 2, 1], [1, 2, 2, 3, 2]] @@ -133,9 +131,9 @@ def test_dlpack_support(self): dltensor = tensor._to_dlpack() tensor_from_dlpack = fluid.core.from_dlpack(dltensor) self.assertTrue(isinstance(tensor_from_dlpack, fluid.core.Tensor)) - self.assertTrue( - np.array_equal(np.array(tensor_from_dlpack), - np.array([[1], [2], [3], [4]]).astype('int'))) + np.testing.assert_array_equal( + np.array(tensor_from_dlpack), + np.array([[1], [2], [3], [4]]).astype('int')) # when build with cuda if core.is_compiled_with_cuda(): gtensor = fluid.create_lod_tensor( @@ -144,9 +142,9 @@ def test_dlpack_support(self): gdltensor = gtensor._to_dlpack() gtensor_from_dlpack = fluid.core.from_dlpack(gdltensor) self.assertTrue(isinstance(gtensor_from_dlpack, fluid.core.Tensor)) - self.assertTrue( - np.array_equal(np.array(gtensor_from_dlpack), - np.array([[1], [2], [3], [4]]).astype('int'))) + np.testing.assert_array_equal( + np.array(gtensor_from_dlpack), + np.array([[1], [2], [3], [4]]).astype('int')) def test_as_type(self): tensor = fluid.create_lod_tensor( diff --git 
a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py index 2bf2f887e9d17..734bd7acf9dec 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py @@ -215,6 +215,215 @@ def make_program(): dist_context, cluster) self.assertTrue(dist_op_cost) + def test_dist_op_cost_part3(self): + + def make_program(): + main_program = paddle.static.Program() + start_program = paddle.static.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4], dtype='float32') + x.stop_gradient = True + label = paddle.static.data(name="label", + shape=[8, 1], + dtype='float32') + label.stop_gradient = True + auto.shard_tensor(x, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [0] + }) + + auto.shard_tensor(label, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [0, -1] + }) + # embedding + tmp = paddle.fluid.layers.fill_constant_batch_size_like( + input=x, shape=[4], value=1, dtype='int32') + embedding = paddle.nn.Embedding(10, 8) + out = embedding(tmp) + # row parallel embedding + for op in main_program.global_block().ops: + if op.type == "lookup_table_v2": + W = main_program.global_block().vars[op.input("W")[0]] + auto.shard_tensor(W, + dist_attr={ + "process_mesh": + auto.ProcessMesh([0, 1]), + "dims_mapping": [0, -1] + }) + out = paddle.fluid.layers.transpose(out, + [1, 0]) # [8, 2] [-1, 0] + + # matmul_v2 + param1 = paddle.fluid.layers.create_parameter( + [4, 8], paddle.float32) # [2, 8] [0, -1] + auto.shard_tensor(param1, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [0, -1] + }) + param2 = paddle.fluid.layers.create_parameter( + [8, 8], paddle.float32) # [8, 4] [-1, 0] + auto.shard_tensor(param2, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [-1, 0] + }) + out1 = paddle.matmul(out, param1) # [8, 8] [-1, -1] + tmp_param = paddle.fluid.layers.create_parameter( + [8, 8], paddle.float32) # [8, 8] [-1, -1] + auto.shard_tensor(param2, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [-1, -1] + }) + tmp_out = paddle.matmul(out1, tmp_param) + out2 = paddle.matmul(tmp_out, param2) # [8, 4] [-1, 0] + + out8 = paddle.fluid.layers.transpose(out2, + [1, 0]) # [4, 8] [0, -1] + + # reshape + out9 = paddle.reshape(out8, [8, 2, 4]) # [4, 2, 4] [0, -1, -1] + tmp_reshape_out = paddle.reshape(out9, [8, 4, 2]) + out10 = paddle.reshape(tmp_reshape_out, + [8, 8]) # [4, 8] [0, -1] + + # softmax + softmax = paddle.nn.Softmax() + out11 = softmax(out10) + error_cost = paddle.nn.functional.square_error_cost( + out11, label) + loss = paddle.mean(error_cost) + return main_program, start_program, loss + + main_program, dist_context = parallelizer(make_program, 0) + ops = main_program.global_block().ops + cluster = Cluster() + cluster.gen_default_config_cluster(device_count=2) + for idx, op in enumerate(ops): + dist_op = dist_context.get_dist_op_for_program(op) + op_dist_attr = dist_op.dist_attr + processes = op_dist_attr.process_mesh.processes + if is_elementwise_op(op.type): + container = get_distributed_operator_impl_container( + "elementwise") + else: + container = get_distributed_operator_impl_container( + op_dist_attr.impl_type) + + dist_impl = container.impls[op_dist_attr.impl_idx] + dist_op_cost = dist_impl.calc_cost(op.attr('op_role'), 
dist_op, + dist_context, cluster) + self.assertTrue(dist_op_cost) + + def test_dist_op_cost_part4(self): + + def make_program(): + main_program = paddle.static.Program() + start_program = paddle.static.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4], dtype='float32') + x.stop_gradient = True + label = paddle.static.data(name="label", + shape=[8, 1], + dtype='float32') + label.stop_gradient = True + auto.shard_tensor(x, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [0] + }) + + auto.shard_tensor(label, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [0, -1] + }) + # embedding + tmp = paddle.fluid.layers.fill_constant_batch_size_like( + input=x, shape=[4], value=1, dtype='int32') + embedding = paddle.nn.Embedding(10, 8) + out = embedding(tmp) + # row parallel embedding + for op in main_program.global_block().ops: + if op.type == "lookup_table_v2": + W = main_program.global_block().vars[op.input("W")[0]] + auto.shard_tensor(W, + dist_attr={ + "process_mesh": + auto.ProcessMesh([0, 1]), + "dims_mapping": [0, -1] + }) + out = paddle.fluid.layers.transpose(out, + [1, 0]) # [8, 2] [-1, 0] + + # mul + param1 = paddle.fluid.layers.create_parameter( + [4, 8], paddle.float32) # [2, 8] [0, -1] + auto.shard_tensor(param1, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [0, -1] + }) + param2 = paddle.fluid.layers.create_parameter( + [8, 8], paddle.float32) # [8, 4] [-1, 0] + auto.shard_tensor(param2, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [-1, 0] + }) + out1 = paddle.fluid.layers.mul(out, param1) # [8, 8] [-1, -1] + tmp_param = paddle.fluid.layers.create_parameter( + [8, 8], paddle.float32) # [8, 8] [-1, -1] + auto.shard_tensor(param2, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [-1, -1] + }) + tmp_out = paddle.fluid.layers.mul(out1, tmp_param) + out2 = paddle.fluid.layers.mul(tmp_out, + param2) # [8, 4] [-1, 0] + + out8 = paddle.fluid.layers.transpose(out2, + [1, 0]) # [4, 8] [0, -1] + + # reshape + out9 = paddle.reshape(out8, [8, 2, 4]) # [4, 2, 4] [0, -1, -1] + tmp_reshape_out = paddle.reshape(out9, [8, 4, 2]) + out10 = paddle.reshape(tmp_reshape_out, + [8, 8]) # [4, 8] [0, -1] + + # softmax + softmax = paddle.nn.Softmax() + out11 = softmax(out10) + error_cost = paddle.nn.functional.square_error_cost( + out11, label) + loss = paddle.mean(error_cost) + return main_program, start_program, loss + + main_program, dist_context = parallelizer(make_program, 0) + ops = main_program.global_block().ops + cluster = Cluster() + cluster.gen_default_config_cluster(device_count=2) + for idx, op in enumerate(ops): + dist_op = dist_context.get_dist_op_for_program(op) + op_dist_attr = dist_op.dist_attr + processes = op_dist_attr.process_mesh.processes + if is_elementwise_op(op.type): + container = get_distributed_operator_impl_container( + "elementwise") + else: + container = get_distributed_operator_impl_container( + op_dist_attr.impl_type) + + dist_impl = container.impls[op_dist_attr.impl_idx] + dist_op_cost = dist_impl.calc_cost(op.attr('op_role'), dist_op, + dist_context, cluster) + self.assertTrue(dist_op_cost) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py b/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py index 9c5df9148c6ce..718ea255bb2bf 100644 --- 
a/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py @@ -364,6 +364,35 @@ def init_data(self): ] +class TestLogPJVPAndTranspose(TestAddPJVPAndTranspose): + + def init_data(self): + # Set prim op + self.op_type = 'log_p' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + self.prim_input = { + 'X': X, + } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[5, 6], dtype='int64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + self.all_ops = [ + # prim op: + 'log_p', + # jvp op: + 'div_p', + # transpose op: + ] + + class TestReshapePJVPAndTranspose(TestAddPJVPAndTranspose): def init_data(self): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py b/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py index 7557d2ba668c7..7745d1d59b3c4 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py @@ -208,6 +208,26 @@ def init_data(self): self.out_map = {0: self.output['Out']} +class TestLogOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'log' + X = paddle.static.data(name='X', shape=[3, 4], dtype='float') + + self.input = { + 'X': X, + } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, ) + self.all_ops = ['log', 'log_p'] + self.out_map = {0: self.output['Out']} + + class TestReshape2Orig2Prim(TestElementWiseAddOrig2Prim): def init_data(self): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py b/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py index 42c8cce0a8fe2..9ab5c563a515b 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py @@ -224,6 +224,26 @@ def init_data(self): self.out_map = {self.output['Y']: 0} +class TestLogPPrim2Orig(TestAddPPrim2Orig): + + def init_data(self): + self.op_type = 'log_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + + self.input = { + 'X': X, + } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.prim2orig_args = (X, ) + self.all_ops = ['log_p', 'log'] + self.out_map = {self.output['Y']: 0} + + class TestReshapePPrim2Orig(TestAddPPrim2Orig): def init_data(self): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py index 777c16a41e6c1..d6baf16a5b66f 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py @@ -146,6 +146,7 @@ def without_program_guard(): ('input_gradients_not_none', paddle.matmul, (np.random.rand(3, 3), np.random.rand(3, 3)), (np.random.rand(3, 3), np.random.rand(3, 3)), 'float64'), + ('log', paddle.log, (np.random.rand(3, 4), ), None, 'float32'), )) class TestForwardGrad(unittest.TestCase): @@ -254,6 +255,7 @@ def test_illegal_param(self): ('sin', paddle.sin, (np.random.rand(100, 200), ), None, 'float32'), ('cos', paddle.cos, (np.random.rand(200, 90), ), None, 'float32'), ('exp', paddle.exp, (np.random.rand(299, 320), ), None, 'float32'), 
+ ('log', paddle.log, (np.random.rand(3, 4), ), None, 'float32'), )) class TestGrad(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primops.py b/python/paddle/fluid/tests/unittests/autograd/test_primops.py index f95e6304b9a2a..00a30899a581f 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primops.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primops.py @@ -41,6 +41,7 @@ ('sin', primops.sin, randn(2, 3), {}, (2, 3), 'float64'), ('cos', primops.cos, randn(2, 3), {}, (2, 3), 'float64'), ('exp', primops.exp, randn(2, 3), {}, (2, 3), 'float64'), + ('log', primops.log, randn(2, 3), {}, (2, 3), 'float64'), ('reshape', primops.reshape, randn(2, 3), { 'shape': (3, 2) }, (3, 2), 'float64'), diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index a33624ee5eedf..6702606ae980d 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -236,7 +236,8 @@ def do_pyreader_training(self, fleet): fleet.save_inference_model(exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost) - self.check_model_right(model_dir) + if fleet.is_first_worker(): + self.check_model_right(model_dir) shutil.rmtree(model_dir) def do_dataset_training_queuedataset(self, fleet): @@ -277,7 +278,8 @@ def do_dataset_training_queuedataset(self, fleet): fleet.save_inference_model(exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost) - self.check_model_right(model_dir) + if fleet.is_first_worker(): + self.check_model_right(model_dir) shutil.rmtree(model_dir) dirname = os.getenv("SAVE_DIRNAME", None) @@ -327,16 +329,35 @@ def do_dataset_training(self, fleet): fleet.save_inference_model(exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost) - self.check_model_right(model_dir) + fleet.load_inference_model(model_dir, mode=0) + if fleet.is_first_worker(): + self.check_model_right(model_dir) shutil.rmtree(model_dir) dirname = os.getenv("SAVE_DIRNAME", None) if dirname: fleet.save_persistables(exe, dirname=dirname) + fleet.load_model(dirname, mode=0) cache_dirname = os.getenv("SAVE_CACHE_DIRNAME", None) if cache_dirname: fleet.save_cache_model(cache_dirname) + dense_param_dirname = os.getenv("SAVE_DENSE_PARAM_DIRNAME", None) + if dense_param_dirname: + fleet.save_dense_params(exe, dense_param_dirname, + fluid.global_scope(), + fluid.default_main_program()) + + save_one_table_dirname = os.getenv("SAVE_ONE_TABLE_DIRNAME", None) + if save_one_table_dirname: + fleet.save_one_table(0, save_one_table_dirname, 0) + fleet.load_one_table(0, save_one_table_dirname, 0) + + patch_dirname = os.getenv("SAVE_PATCH_DIRNAME", None) + if patch_dirname: + fleet.save_persistables(exe, patch_dirname, None, 5) + fleet.check_save_pre_patch_done() + if __name__ == "__main__": runtime_main(TestDistCTR2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py index 4ecad3e97c676..eee2ac9e1ab93 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py @@ -91,7 +91,8 @@ def do_pyreader_training(self, fleet): fleet.save_inference_model(exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost) - self.check_model_right(model_dir) + if fleet.is_first_worker(): + self.check_model_right(model_dir) if fleet.is_first_worker(): fleet.save_persistables(executor=exe, 
dirname=model_dir) shutil.rmtree(model_dir) @@ -139,7 +140,8 @@ def do_dataset_training(self, fleet): fleet.save_inference_model(exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost) - self.check_model_right(model_dir) + if fleet.is_first_worker(): + self.check_model_right(model_dir) if fleet.is_first_worker(): fleet.save_persistables(executor=exe, dirname=model_dir) shutil.rmtree(model_dir) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py index 83b66f5c2b217..6fa6b2769298b 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py @@ -23,6 +23,7 @@ from config import ATOL, RTOL from parameterize import xrand +np.random.seed(2022) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py index 24c21d1bd45da..741fcab0438db 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py @@ -23,6 +23,8 @@ from test_distribution import DistributionNumpy +np.random.seed(2022) + class CategoricalNumpy(DistributionNumpy): diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_constraint.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_constraint.py index b927aef8e9b81..d061576c83644 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_constraint.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_constraint.py @@ -21,6 +21,8 @@ import config import parameterize as param +np.random.seed(2022) + @param.param_cls((param.TEST_CASE_NAME, 'value'), [('NotImplement', np.random.rand(2, 3))]) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py index 8188b2231f294..9645598d96425 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py @@ -22,6 +22,8 @@ from config import ATOL, DEVICES, RTOL import parameterize as param +np.random.seed(2022) + @param.place(DEVICES) @param.param_cls( diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py index c4630bbd84b57..66103e4a91749 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py @@ -21,6 +21,7 @@ from config import ATOL, DEVICES, RTOL from parameterize import TEST_CASE_NAME, parameterize_cls, place, xrand +np.random.seed(2022) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py index cc4b843091379..a1fea64f165c4 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py @@ 
-22,6 +22,8 @@ import mock_data as mock import parameterize +np.random.seed(2022) + @parameterize.place(config.DEVICES) @parameterize.parameterize_cls( diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py index 63f1fa81bf187..a2ebaca46a802 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py @@ -22,6 +22,7 @@ import mock_data as mock import parameterize +np.random.seed(2022) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent.py index 4f0639a0380a3..036d621f5c66b 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent.py @@ -21,6 +21,8 @@ import config import parameterize as param +np.random.seed(2022) + @param.place(config.DEVICES) @param.param_cls( diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent_static.py index e0196ecbf136e..950e137be1dd5 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent_static.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_independent_static.py @@ -21,6 +21,7 @@ import config import parameterize as param +np.random.seed(2022) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py index 9e597c3d3635e..5023905caa744 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py @@ -23,6 +23,8 @@ from test_distribution import DistributionNumpy +np.random.seed(2022) + class NormalNumpy(DistributionNumpy): diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform.py index 8311a10f4d5f2..7f179ca572871 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform.py @@ -22,6 +22,9 @@ import config import parameterize as param +np.random.seed(2022) +paddle.seed(2022) + @param.place(config.DEVICES) class TestTransform(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform_static.py index 00a1f409dad52..0375bc229212a 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform_static.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_transform_static.py @@ -21,6 +21,8 @@ import config import parameterize as param +np.random.seed(2022) +paddle.seed(2022) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py index 3fbb382a2403a..c592e53e9befc 
100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py @@ -23,6 +23,8 @@ from test_distribution import DistributionNumpy +np.random.seed(2022) + class UniformNumpy(DistributionNumpy): diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_variable.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_variable.py index 94558395e0035..b0b8e8fd6fd6f 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_distribution_variable.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_variable.py @@ -22,6 +22,8 @@ import config import parameterize as param +paddle.seed(2022) + @param.param_cls( (param.TEST_CASE_NAME, 'is_discrete', 'event_rank', 'constraint'), diff --git a/python/paddle/fluid/tests/unittests/distribution/test_kl.py b/python/paddle/fluid/tests/unittests/distribution/test_kl.py index 0a957c540bed7..6c00a6a3273ed 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_kl.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_kl.py @@ -25,6 +25,8 @@ import mock_data as mock import parameterize as param +np.random.seed(2022) +paddle.seed(2022) paddle.set_default_dtype('float64') diff --git a/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py b/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py index 3bd62e1334bc0..b62a51c73663b 100644 --- a/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py +++ b/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py @@ -25,6 +25,8 @@ import parameterize as param import mock_data as mock +np.random.seed(2022) +paddle.seed(2022) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py index 34b485a8bd462..35be51213607b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py @@ -61,7 +61,7 @@ def __reader__(): return __reader__ -def optimizer_setting(model, use_pure_fp16, opt_group=False): +def optimizer_setting(model, use_multi_precision, opt_group=False): clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) optimizer = paddle.optimizer.Momentum( parameters=[{ @@ -70,16 +70,23 @@ def optimizer_setting(model, use_pure_fp16, opt_group=False): learning_rate=0.001, weight_decay=0.00001, grad_clip=clip, - multi_precision=use_pure_fp16) + multi_precision=use_multi_precision) return optimizer -def train_mlp(model, shard_level, use_pure_fp16, output_dir): +def train_mlp(model, + shard_level, + use_multi_precision, + output_dir, + amp_level='O1'): group = paddle.distributed.new_group([0, 1]) - optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) - model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') + optimizer = optimizer_setting(model=model, + use_multi_precision=use_multi_precision) + model = paddle.amp.decorate(models=model, + level=amp_level, + save_dtype='float32') scaler = paddle.amp.GradScaler(init_loss_scaling=32768) model, optimizer, scaler = group_sharded_parallel(model=model, @@ -104,13 +111,13 @@ def train_mlp(model, shard_level, use_pure_fp16, output_dir): img, label = data label.stop_gradient = True img.stop_gradient = True - with paddle.amp.auto_cast(True, level='O2'): + with paddle.amp.auto_cast(True, level=amp_level): out = model(img) 
loss = paddle.nn.functional.cross_entropy(input=out, label=label) avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) - if not use_pure_fp16: + if not use_multi_precision: avg_loss.backward() optimizer.step() else: @@ -135,12 +142,36 @@ def test_sharding_api(): # fp16 stage2_params = train_mlp(mlp1, shard_level="os_g", - use_pure_fp16=True, - output_dir=output_dir) + use_multi_precision=True, + output_dir=output_dir, + amp_level='O2') stage3_params = train_mlp(mlp2, shard_level="p_g_os", - use_pure_fp16=True, - output_dir=output_dir) + use_multi_precision=True, + output_dir=output_dir, + amp_level='O2') + + for i in range(len(stage3_params)): + np.testing.assert_allclose(stage2_params[i].numpy(), + stage3_params[i].numpy(), + rtol=1e-4, + atol=1e-3) + + # AMP + mlp3, mlp4 = MLP(), MLP() + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + + stage2_params = train_mlp(mlp3, + shard_level="os_g", + use_multi_precision=True, + output_dir=output_dir, + amp_level='O1') + stage3_params = train_mlp(mlp4, + shard_level="p_g_os", + use_multi_precision=True, + output_dir=output_dir, + amp_level='O1') for i in range(len(stage3_params)): np.testing.assert_allclose(stage2_params[i].numpy(), diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py index 8f6dadb5ce978..5de9b5ecea084 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py @@ -61,7 +61,7 @@ def __reader__(): return __reader__ -def optimizer_setting(model, use_pure_fp16, opt_group=False): +def optimizer_setting(model, use_multi_precision, opt_group=False): clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) optimizer = paddle.optimizer.Momentum( parameters=[{ @@ -70,14 +70,21 @@ def optimizer_setting(model, use_pure_fp16, opt_group=False): learning_rate=0.001, weight_decay=0.00001, grad_clip=clip, - multi_precision=use_pure_fp16) + multi_precision=use_multi_precision) return optimizer -def train_mlp(model, shard_level, use_pure_fp16, output_dir): - optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) - model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') +def train_mlp(model, + shard_level, + use_multi_precision, + output_dir, + amp_level='O1'): + optimizer = optimizer_setting(model=model, + use_multi_precision=use_multi_precision) + model = paddle.amp.decorate(models=model, + level=amp_level, + save_dtype='float32') scaler = paddle.amp.GradScaler(init_loss_scaling=32768) model, optimizer, scaler = group_sharded_parallel(model=model, @@ -102,13 +109,13 @@ def train_mlp(model, shard_level, use_pure_fp16, output_dir): img, label = data label.stop_gradient = True img.stop_gradient = True - with paddle.amp.auto_cast(True, level='O2'): + with paddle.amp.auto_cast(True, level=amp_level): out = model(img) loss = paddle.nn.functional.cross_entropy(input=out, label=label) avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) - if not use_pure_fp16: + if not use_multi_precision: avg_loss.backward() optimizer.step() else: @@ -134,19 +141,36 @@ def test_sharding_api(): # fp16 stage2_params = train_mlp(mlp1, shard_level="os_g", - use_pure_fp16=True, - output_dir=output_dir) + use_multi_precision=True, + output_dir=output_dir, + amp_level='O2') stage3_params = train_mlp(mlp2, shard_level="p_g_os", - use_pure_fp16=True, - output_dir=output_dir) + use_multi_precision=True, + 
output_dir=output_dir, + amp_level='O2') for i in range(len(stage3_params)): np.testing.assert_allclose(stage2_params[i].numpy(), stage3_params[i].numpy(), rtol=1e-4, atol=1e-3) - shutil.rmtree(output_dir) + + # AMP + mlp3, mlp4 = MLP(), MLP() + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + + stage2_params = train_mlp(mlp3, + shard_level="os_g", + use_multi_precision=True, + output_dir=output_dir, + amp_level='O1') + stage3_params = train_mlp(mlp4, + shard_level="p_g_os", + use_multi_precision=True, + output_dir=output_dir, + amp_level='O1') if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py index b5ccf735ce229..65aec5ca8dd0e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py @@ -164,7 +164,7 @@ def test_tensor_shape(self): net = ShapeLayer() out = net(x) - self.assertTrue(np.array_equal(out.numpy(), x.numpy())) + np.testing.assert_array_equal(out.numpy(), x.numpy()) class TestIfElseNoValue(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_deepcopy.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_deepcopy.py index dcc12e120d689..ecb2d97fa4482 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_deepcopy.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_deepcopy.py @@ -36,7 +36,7 @@ def test_net(self): self.assertFalse(isinstance(net.forward, StaticFunction)) self.assertTrue(id(copy_net), id(copy_net.forward.__self__)) - self.assertTrue(np.array_equal(src_out.numpy(), copy_out.numpy())) + np.testing.assert_array_equal(src_out.numpy(), copy_out.numpy()) def test_func(self): st_foo = paddle.jit.to_static(foo) @@ -48,7 +48,7 @@ def test_func(self): new_foo = deepcopy(st_foo) self.assertFalse(isinstance(new_foo, StaticFunction)) new_out = new_foo(x) - self.assertTrue(np.array_equal(st_out.numpy(), new_out.numpy())) + np.testing.assert_array_equal(st_out.numpy(), new_out.numpy()) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py index 7d980b5f75a62..27d7389b903cc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py @@ -443,6 +443,20 @@ def test_error(self): for disable_new_error in [0, 1]: self._test_raise_new_exception(disable_new_error) +@paddle.jit.to_static +def func_ker_error(x): + d = { + 'x': x + } + y = d['y'] + x + return y + +class TestKeyError(unittest.TestCase): + def test_key_error(self): + paddle.disable_static() + with self.assertRaises(error.Dy2StKeyError): + x = paddle.to_tensor([1]) + func_ker_error(x) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_gradient_aggregation.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_gradient_aggregation.py new file mode 100644 index 0000000000000..3b7cca31ce989 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_gradient_aggregation.py @@ -0,0 +1,60 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import paddle +import numpy as np + +SEED = 2020 +np.random.seed(SEED) + + +class SimpleNet(paddle.nn.Layer): + + def __init__(self): + super(SimpleNet, self).__init__() + self.linear1 = paddle.nn.Linear(10, 3) + self.linear2 = paddle.nn.Linear(3, 1) + + def forward(self, x): + out1 = self.linear1(x) + out2 = self.linear2(out1) + return [out1, out2] # gradient is 0 + #return [out1] # gradient is normal + #return [out2, out1] # gradient is normal + + +class TestGradientAggregationInDy2Static(unittest.TestCase): + + def test_to_static(self): + + def simplenet_grad(inp, to_static=False): + net = SimpleNet() + if to_static: net = paddle.jit.to_static(net) + loss = net(inp) + loss[0].backward() + return net.linear1.weight.grad + + inp = paddle.to_tensor(np.random.randn(10, )).astype("float32") + self.assertTrue( + np.allclose( + simplenet_grad(inp, True).numpy(), + simplenet_grad(inp, False).numpy())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index 8ecae3c6b8d3a..560ae6b4adefc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -201,7 +201,7 @@ def test_prune(self): model.eval() input_ids = paddle.to_tensor(input_ids) out = model(input_ids) - self.assertTrue(np.array_equal(out.numpy(), [[15, 11]])) + np.testing.assert_array_equal(out.numpy(), [[15, 11]]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_rollback.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_rollback.py index 5277a50c299ea..f949e9c0da78d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_rollback.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_rollback.py @@ -82,7 +82,7 @@ def test_plain_func(self): dy_out = st_foo(x) self.assertTrue(func_to_source_code(foo) == func_to_source_code(st_foo)) - self.assertTrue(np.array_equal(st_out.numpy(), dy_out.numpy())) + np.testing.assert_array_equal(st_out.numpy(), dy_out.numpy()) class TestRollBackNet(unittest.TestCase): @@ -111,15 +111,15 @@ def test_net(self): self.assertFalse(isinstance(net.forward, StaticFunction)) self.assertFalse("true_fn" in func_to_source_code(net.sub.forward)) dy_fwd_out = net(x) - self.assertTrue(np.array_equal(st_fwd_out.numpy(), dy_fwd_out.numpy())) + np.testing.assert_array_equal(st_fwd_out.numpy(), dy_fwd_out.numpy()) # rollback infer into original dygraph method net.infer.rollback() self.assertFalse(isinstance(net.infer, StaticFunction)) self.assertFalse("true_fn" in func_to_source_code(net.sub.forward)) dy_infer_out = net.infer(x) - self.assertTrue( - np.array_equal(st_infer_out.numpy(), dy_infer_out.numpy())) + np.testing.assert_array_equal(st_infer_out.numpy(), +
dy_infer_out.numpy()) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py index 48dc33cc6c786..7e8390d54430b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py @@ -208,8 +208,8 @@ def test_static_slice_step(self): out = exe.run(prog, feed={'x': array}, fetch_list=[z1, z2]) - self.assertTrue(np.array_equal(out[0], array[::2])) - self.assertTrue(np.array_equal(out[1], array[::-2])) + np.testing.assert_array_equal(out[0], array[::2]) + np.testing.assert_array_equal(out[1], array[::-2]) def test_static_slice_step_dygraph2static(self): paddle.disable_static() @@ -225,10 +225,10 @@ def func(inps): input_spec=[InputSpec(shape=[None, 4, 4])]) static_result = sfunc(inps) - self.assertTrue( - np.array_equal(origin_result[0].numpy(), static_result[0].numpy())) - self.assertTrue( - np.array_equal(origin_result[1].numpy(), static_result[1].numpy())) + np.testing.assert_array_equal(origin_result[0].numpy(), + static_result[0].numpy()) + np.testing.assert_array_equal(origin_result[1].numpy(), + static_result[1].numpy()) class TestPaddleStridedSlice(unittest.TestCase): @@ -268,10 +268,8 @@ def test_compare_paddle_strided_slice_with_numpy(self): ends=e2, strides=stride2) - self.assertTrue( - np.array_equal( - sl.numpy(), array[s2[0]:e2[0]:stride2[0], - s2[1]:e2[1]:stride2[1]])) + np.testing.assert_array_equal( + sl.numpy(), array[s2[0]:e2[0]:stride2[0], s2[1]:e2[1]:stride2[1]]) array = np.arange(6 * 7 * 8).reshape((6, 7, 8)) pt = paddle.to_tensor(array) @@ -285,9 +283,10 @@ def test_compare_paddle_strided_slice_with_numpy(self): strides=stride2) array_slice = array[s2[0]:e2[0]:stride2[0], ::, s2[1]:e2[1]:stride2[1]] - self.assertTrue( - np.array_equal(sl.numpy(), array_slice), - msg="paddle.strided_slice:\n {} \n numpy slice:\n{}".format( + np.testing.assert_array_equal( + sl.numpy(), + array_slice, + err_msg='paddle.strided_slice:\n {} \n numpy slice:\n{}'.format( sl.numpy(), array_slice)) diff --git a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py index 3c48c99af34b5..756dd7889977b 100644 --- a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py +++ b/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py @@ -12,20 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import enum import numpy as np from functools import partial from numpy import asarray from numpy.fft._pocketfft import _raw_fft, _raw_fftnd, _get_forward_norm, _get_backward_norm, _cook_nd_args +class NormMode(enum.Enum): + none = 1 + by_sqrt_n = 2 + by_n = 3 + + +def _get_norm_mode(norm, forward): + if norm == "ortho": + return NormMode.by_sqrt_n + if norm is None or norm == "backward": + return NormMode.none if forward else NormMode.by_n + return NormMode.by_n if forward else NormMode.none + + +def _get_inv_norm(n, norm_mode): + assert isinstance(norm_mode, + NormMode), "invalid norm_type {}".format(norm_mode) + if norm_mode == NormMode.none: + return 1.0 + if norm_mode == NormMode.by_sqrt_n: + return np.sqrt(n) + return n + + +# 1d transforms def _fftc2c(a, n=None, axis=-1, norm=None, forward=None): a = asarray(a) if n is None: n = a.shape[axis] - if forward: - inv_norm = _get_forward_norm(n, norm) - else: - inv_norm = _get_backward_norm(n, norm) + inv_norm = _get_inv_norm(n, norm) output = _raw_fft(a, n, axis, False, forward, inv_norm) return output @@ -34,10 +57,7 @@ def _fftr2c(a, n=None, axis=-1, norm=None, forward=None): a = asarray(a) if n is None: n = a.shape[axis] - if forward: - inv_norm = _get_forward_norm(n, norm) - else: - inv_norm = _get_backward_norm(n, norm) + inv_norm = _get_inv_norm(n, norm) output = _raw_fft(a, n, axis, True, True, inv_norm) if not forward: output = output.conj() @@ -48,43 +68,67 @@ def _fftc2r(a, n=None, axis=-1, norm=None, forward=None): a = asarray(a) if n is None: n = (a.shape[axis] - 1) * 2 - if forward: - inv_norm = _get_forward_norm(n, norm) - else: - inv_norm = _get_backward_norm(n, norm) + inv_norm = _get_inv_norm(n, norm) output = _raw_fft(a.conj() if forward else a, n, axis, True, False, inv_norm) return output -def fft_c2c(x, axes, normalization, forward): +# general fft functors +def _fft_c2c_nd(x, axes, norm_mode, forward): f = partial(_fftc2c, forward=forward) - y = _raw_fftnd(x, s=None, axes=axes, function=f, norm=normalization) + y = _raw_fftnd(x, s=None, axes=axes, function=f, norm=norm_mode) return y -def fft_c2c_backward(dy, axes, normalization, forward): - f = partial(_fftc2c, forward=forward) - dx = _raw_fftnd(dy, s=None, axes=axes, function=f, norm=normalization) - return dx - - -def fft_r2c(x, axes, normalization, forward, onesided): +def _fft_r2c_nd(x, axes, norm_mode, forward, onesided): a = asarray(x) s, axes = _cook_nd_args(a, axes=axes) if onesided: - a = _fftr2c(a, s[-1], axes[-1], normalization, forward) - for ii in range(len(axes) - 1): - a = _fftc2c(a, s[ii], axes[ii], normalization, forward) + a = _fftr2c(a, s[-1], axes[-1], norm_mode, forward) + a = _fft_c2c_nd(a, axes[:-1], norm_mode, forward) else: - a = fft_c2c(x, axes, normalization, forward) + a = _fft_c2c_nd(x, axes, norm_mode, forward) + return a + + +def _fft_c2r_nd(x, axes, norm_mode, forward, last_dim_size): + a = asarray(x) + s, axes = _cook_nd_args(a, axes=axes, invreal=1) + if last_dim_size is not None: + s[-1] = last_dim_size + a = _fft_c2c_nd(a, axes[:-1], norm_mode, forward) + a = _fftc2r(a, s[-1], axes[-1], norm_mode, forward) return a -def fft_r2c_backward(dy, x, axes, normalization, forward, onesided): +# kernels +def fft_c2c(x, axes, normalization, forward): + norm_mode = _get_norm_mode(normalization, forward) + return _fft_c2c_nd(x, axes, norm_mode, forward) + + +def fft_c2r(x, axes, normalization, forward, last_dim_size): + norm_mode = _get_norm_mode(normalization, forward) + return _fft_c2r_nd(x, axes, norm_mode, forward, last_dim_size) + + 
+def fft_r2c(x, axes, normalization, forward, onesided): + norm_mode = _get_norm_mode(normalization, forward) + return _fft_r2c_nd(x, axes, norm_mode, forward, onesided) + + +# backward kernel +def fft_c2c_backward(dy, axes, normalization, forward): + norm_mode = _get_norm_mode(normalization, forward) + dx = _fft_c2c_nd(dy, axes, norm_mode, not forward) + return dx + + +def fft_r2c_backward(x, dy, axes, normalization, forward, onesided): a = dy if not onesided: - a = fft_c2c_backward(a, axes, normalization, forward).real + a = fft_c2c_backward(a, axes, normalization, forward) else: pad_widths = [(0, 0)] * a.ndim last_axis = axes[-1] @@ -93,16 +137,25 @@ def fft_r2c_backward(dy, x, axes, normalization, forward, onesided): last_dim_size = a.shape[last_axis] pad_widths[last_axis] = (0, x.shape[last_axis] - last_dim_size) a = np.pad(a, pad_width=pad_widths) - a = fft_c2c_backward(a, axes, normalization, forward).real - return a + a = fft_c2c_backward(a, axes, normalization, forward) + return a.real -def fft_c2r(x, axes, normalization, forward, last_dim_size): - a = asarray(x) - s, axes = _cook_nd_args(a, axes=axes, invreal=1) - if last_dim_size is not None: - s[-1] = last_dim_size - for ii in range(len(axes) - 1): - a = _fftc2c(a, s[ii], axes[ii], normalization, forward) - a = _fftc2r(a, s[-1], axes[-1], normalization, forward) +def _fft_fill_conj_grad(x, axes, length_to_double): + last_fft_axis = axes[-1] + shape = x.shape + for multi_index in np.ndindex(*shape): + if 0 < multi_index[last_fft_axis] and multi_index[ + last_fft_axis] <= length_to_double: + x[multi_index] *= 2 + return x + + +def fft_c2r_backward(x, dy, axes, normalization, forward, last_dim_size): + norm_mode = _get_norm_mode(normalization, forward) + a = dy + a = _fft_r2c_nd(dy, axes, norm_mode, not forward, True) + last_fft_axis = axes[-1] + length_to_double = dy.shape[last_fft_axis] - x.shape[last_fft_axis] + a = _fft_fill_conj_grad(a, axes, length_to_double) return a diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py index f386fdc9c3460..f7cc9fbf4a130 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_fft.py +++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py @@ -473,7 +473,7 @@ def test_irfft2(self): @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [ ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool_), None, -1, 'backward', NotImplementedError), + np.bool_), None, -1, 'backward', RuntimeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError), ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1, @@ -543,7 +543,7 @@ def test_irfft(self): (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool_), None, (-2, -1), 'backward', NotImplementedError), + np.bool_), None, (-2, -1), 'backward', RuntimeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), @@ -625,7 +625,7 @@ def test_irfft2(self): (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool_), None, (-2, -1), 'backward', NotImplementedError), + np.bool_), None, (-2, -1), 'backward', RuntimeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * 
np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), diff --git a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py index ba4092965920b..066869750ed07 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py +++ b/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py @@ -20,12 +20,13 @@ import re import sys -from spectral_op_np import fft_c2c, fft_r2c, fft_c2r +from spectral_op_np import fft_c2c, fft_r2c, fft_c2r, fft_c2c_backward, fft_r2c_backward, fft_c2r_backward import paddle.fluid.core as core import paddle.fluid.dygraph as dg import paddle.static as static from numpy.random import random as rand from paddle.fluid import Program, program_guard +from paddle import _C_ops sys.path.append("../") from op_test import OpTest @@ -73,14 +74,26 @@ def class_name(cls, num, params_dict): return "{}_{}{}".format(cls.__name__, num, suffix and "_" + suffix) +def fft_c2c_python_api(x, axes, norm, forward): + return _C_ops.final_state_fft_c2c(x, axes, norm, forward) + + +def fft_r2c_python_api(x, axes, norm, forward, onesided): + return _C_ops.final_state_fft_r2c(x, axes, norm, forward, onesided) + + +def fft_c2r_python_api(x, axes, norm, forward, last_dim_size=0): + return _C_ops.final_state_fft_c2r(x, axes, norm, forward, last_dim_size) + + @parameterize( (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward'), [('test_axes_is_sqe_type', (np.random.random( (12, 14)) + 1j * np.random.random( (12, 14))).astype(np.complex128), [0, 1], 'forward', True), ('test_axis_not_last', (np.random.random( - (4, 4, 4)) + 1j * np.random.random( - (4, 4, 4))).astype(np.complex128), (0, 1), "backward", False), + (4, 8, 4)) + 1j * np.random.random( + (4, 8, 4))).astype(np.complex128), (0, 1), "backward", False), ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random( (12, 14))).astype(np.complex128), (0, ), "forward", False), ('test_norm_backward', (np.random.random((12, 14)) + 1j * np.random.random( @@ -88,11 +101,11 @@ def class_name(cls, num, params_dict): ('test_norm_ortho', (np.random.random((12, 14)) + 1j * np.random.random( (12, 14))).astype(np.complex128), (1, ), "ortho", True)]) class TestFFTC2COp(OpTest): - # Because framwork not support complex numerial gradient, we skip gradient check. 
- no_need_check_grad = True def setUp(self): self.op_type = "fft_c2c" + self.dtype = self.x.dtype + self.python_api = fft_c2c_python_api out = fft_c2c(self.x, self.axes, self.norm, self.forward) @@ -104,8 +117,21 @@ def setUp(self): } self.outputs = {'Out': out} + self.out_grad = (np.random.random(self.x.shape) + + 1j * np.random.random(self.x.shape)).astype( + self.x.dtype) + self.x_grad = fft_c2c_backward(self.out_grad, self.axes, self.norm, + self.forward) + def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) + + def test_check_grad(self): + self.check_grad("X", + "Out", + user_defined_grads=[self.x_grad], + user_defined_grad_outputs=[self.out_grad], + check_eager=True) @parameterize( @@ -114,7 +140,7 @@ def test_check_output(self): (12, 14)) + 1j * np.random.random( (12, 14))).astype(np.complex128), [0, 1], 'forward', True, 26), ('test_axis_not_last', (np.random.random( - (4, 4, 4)) + 1j * np.random.random((4, 4, 4))).astype(np.complex128), + (4, 7, 4)) + 1j * np.random.random((4, 7, 4))).astype(np.complex128), (0, 1), "backward", False, None), ('test_norm_forward', (np.random.random((12, 14)) + 1j * np.random.random( (12, 14))).astype(np.complex128), (0, ), "forward", False, 22), @@ -123,11 +149,11 @@ def test_check_output(self): ('test_norm_ortho', (np.random.random((12, 14)) + 1j * np.random.random( (12, 14))).astype(np.complex128), (1, ), "ortho", True, 26)]) class TestFFTC2ROp(OpTest): - # Because framwork not support complex numerial gradient, we skip gradient check. - no_need_check_grad = True def setUp(self): self.op_type = "fft_c2r" + self.dtype = self.x.dtype + self.python_api = fft_c2r_python_api out = fft_c2r(self.x, self.axes, self.norm, self.forward, self.last_dim_size) @@ -141,28 +167,40 @@ def setUp(self): } self.outputs = {'Out': out} + self.out_grad = np.random.random(out.shape).astype(out.dtype) + self.x_grad = fft_c2r_backward(self.x, self.out_grad, self.axes, + self.norm, self.forward, + self.last_dim_size) + def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) + + def test_check_grad(self): + self.check_grad(["X"], + "Out", + user_defined_grads=[self.x_grad], + user_defined_grad_outputs=[self.out_grad], + check_eager=True) @parameterize( (TEST_CASE_NAME, 'x', 'axes', 'norm', 'forward', 'onesided'), - [('test_axes_is_sqe_type', np.random.randn(12, 14).astype(np.float64), + [('test_axes_is_sqe_type', np.random.randn(12, 18).astype(np.float64), (0, 1), 'forward', True, True), - ('test_axis_not_last', np.random.randn(4, 4, 4).astype(np.float64), + ('test_axis_not_last', np.random.randn(4, 8, 4).astype(np.float64), (0, 1), "backward", False, True), - ('test_norm_forward', np.random.randn(12, 14).astype(np.float64), + ('test_norm_forward', np.random.randn(12, 18).astype(np.float64), (0, 1), "forward", False, False), - ('test_norm_backward', np.random.randn(12, 14).astype(np.float64), + ('test_norm_backward', np.random.randn(12, 18).astype(np.float64), (0, ), "backward", True, False), - ('test_norm_ortho', np.random.randn(12, 14).astype(np.float64), + ('test_norm_ortho', np.random.randn(12, 18).astype(np.float64), (1, ), "ortho", True, False)]) class TestFFTR2COp(OpTest): - # Because framwork not support complex numerial gradient, we skip gradient check. 
- no_need_check_grad = True def setUp(self): self.op_type = "fft_r2c" + self.dtype = self.x.dtype + self.python_api = fft_r2c_python_api out = fft_r2c(self.x, self.axes, self.norm, self.forward, self.onesided) @@ -175,5 +213,16 @@ def setUp(self): } self.outputs = {'Out': out} + self.out_grad = np.random.random(out.shape).astype(out.dtype) + self.x_grad = fft_r2c_backward(self.x, self.out_grad, self.axes, + self.norm, self.forward, self.onesided) + def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) + + def test_check_grad(self): + self.check_grad("X", + "Out", + user_defined_grads=[self.x_grad], + user_defined_grad_outputs=[self.out_grad], + check_eager=True) diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py index aa0290cf4b5fa..ef56e087f50e9 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py @@ -97,9 +97,9 @@ def test_with_feed(self): for x, y in zip(gt, res): if isinstance(x, list): for tx, ty in zip(x, y): - self.assertTrue(np.array_equal(tx, ty)) + np.testing.assert_array_equal(tx, ty) elif isinstance(x, np.ndarray): - self.assertTrue(np.array_equal(tx, ty)) + np.testing.assert_array_equal(tx, ty) else: raise Exception("Not Implement!") diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index ad13061d17802..75741f90aeee6 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -261,7 +261,7 @@ def test_with_feed(self): res = self.run_new_executor(feed) gt = self.run_raw_executor(feed) for x, y in zip(gt, res): - self.assertTrue(np.array_equal(x, y)) + np.testing.assert_array_equal(x, y) def test_with_error(self): feed = [{'a': np.ones([2, 2], dtype="float32")}] @@ -277,7 +277,7 @@ def test_compiled_program(self): res = self.run_new_executor(feed, use_compiled=True) gt = self.run_raw_executor(feed, use_compiled=True) for x, y in zip(gt, res): - self.assertTrue(np.array_equal(x, y)) + np.testing.assert_array_equal(x, y) def test_compiled_program_convert_graph_to_program(self): data = np.ones([2, 2], dtype="float32") @@ -286,7 +286,7 @@ def test_compiled_program_convert_graph_to_program(self): res = self.run_new_executor(feed, use_compiled=True) gt = self.run_raw_executor(feed, use_compiled=True) for x, y in zip(gt, res): - self.assertTrue(np.array_equal(x, y)) + np.testing.assert_array_equal(x, y) def test_empty_program(self): program = paddle.static.Program() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py new file mode 100644 index 0000000000000..a27ed9dd9c99a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_activation_fuse_pass.py @@ -0,0 +1,132 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +from functools import partial +import unittest + +import hypothesis.strategies as st + + +class TestMatmulElementwiseAddActivationMkldnnFusePass(PassAutoScanTest): + + def sample_program_config(self, draw): + axis = draw(st.sampled_from([-1, 0, 1])) + matmul_as_x = draw(st.booleans()) + batch_size = draw(st.integers(min_value=2, max_value=4)) + channel = draw(st.sampled_from([16, 32, 64])) + input_dim = draw(st.sampled_from([16, 32, 64])) + activation_type = draw( + st.sampled_from([ + 'relu', 'gelu', 'tanh', 'sigmoid', 'swish', 'mish', 'sqrt', + 'hard_swish', 'sigmoid', 'abs', 'relu6', 'clip', 'tanh', + 'hard_sigmoid', 'leaky_relu' + ])) + + def generate_input(): + return np.random.random([batch_size, channel, input_dim, + input_dim]).astype(np.float32) + + matmul_op = OpConfig(type='matmul', + inputs={ + 'X': ['matmul_x'], + 'Y': ['matmul_y'] + }, + outputs={'Out': ['matmul_output']}, + attrs={ + 'use_mkldnn': True, + }) + + if matmul_as_x: + inputs = {'X': ['matmul_output'], 'Y': ['elementwise_addend']} + else: + inputs = {'X': ['elementwise_addend'], 'Y': ['matmul_output']} + + elt_add_op = OpConfig(type='elementwise_add', + inputs=inputs, + outputs={'Out': ['elementwise_add_output']}, + attrs={ + 'axis': axis, + 'use_mkldnn': True + }) + + if activation_type == "relu6": + activation_op = OpConfig(activation_type, + inputs={"X": ["elementwise_add_output"]}, + outputs={"Out": ["activation_output"]}, + threshold=draw( + st.floats(min_value=1.0, + max_value=10.0))) + elif activation_type == "leaky_relu": + activation_op = OpConfig(activation_type, + inputs={"X": ["elementwise_add_output"]}, + outputs={"Out": ["activation_output"]}, + alpha=draw( + st.floats(min_value=0.1, + max_value=1.0))) + elif activation_type == "swish": + activation_op = OpConfig(activation_type, + inputs={"X": ["elementwise_add_output"]}, + outputs={"Out": ["activation_output"]}, + beta=draw( + st.floats(min_value=0.1, + max_value=1.0))) + elif activation_type == "clip": + activation_op = OpConfig( + activation_type, + inputs={"X": ["elementwise_add_output"]}, + outputs={"Out": ["activation_output"]}, + min=draw(st.floats(min_value=0.1, max_value=0.49)), + max=draw(st.floats(min_value=0.5, max_value=1.0))) + else: + activation_op = OpConfig(activation_type, + inputs={"X": ["elementwise_add_output"]}, + outputs={"Out": ["activation_output"]}) + + model_net = [matmul_op, elt_add_op, activation_op] + + program_config = ProgramConfig( + ops=model_net, + weights={}, + inputs={ + 'matmul_x': TensorConfig(data_gen=partial(generate_input)), + 'matmul_y': TensorConfig(data_gen=partial(generate_input)), + 'elementwise_addend': + TensorConfig(data_gen=partial(generate_input)) + }, + outputs=['activation_output']) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config( + use_mkldnn=True, + passes=[ + 'matmul_elementwise_add_mkldnn_fuse_pass', + 'matmul_activation_mkldnn_fuse_pass' + ]) + yield config, ['matmul'], (1e-5, 
1e-5) + + def test(self): + self.run_and_statis(quant=False, + passes=[ + 'matmul_elementwise_add_mkldnn_fuse_pass', + 'matmul_activation_mkldnn_fuse_pass' + ]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py new file mode 100644 index 0000000000000..38c8985dbad1f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_elementwise_add_fuse_pass.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +from functools import partial +import unittest + +import hypothesis.strategies as st + + +class TestMatmulElementwiseAddMkldnnFusePass(PassAutoScanTest): + + def sample_program_config(self, draw): + axis = draw(st.sampled_from([-1, 0, 1])) + matmul_as_x = draw(st.booleans()) + batch_size = draw(st.integers(min_value=2, max_value=4)) + channel = draw(st.sampled_from([16, 32, 64])) + input_dim = draw(st.sampled_from([16, 32, 64])) + + def generate_input(): + return np.random.random([batch_size, channel, input_dim, + input_dim]).astype(np.float32) + + matmul_op = OpConfig(type='matmul', + inputs={ + 'X': ['matmul_x'], + 'Y': ['matmul_y'] + }, + outputs={'Out': ['matmul_output']}, + attrs={ + 'use_mkldnn': True, + }) + + if matmul_as_x: + inputs = {'X': ['matmul_output'], 'Y': ['elementwise_addend']} + else: + inputs = {'X': ['elementwise_addend'], 'Y': ['matmul_output']} + + elt_add_op = OpConfig(type='elementwise_add', + inputs=inputs, + outputs={'Out': ['elementwise_add_output']}, + attrs={ + 'axis': axis, + 'use_mkldnn': True + }) + + model_net = [matmul_op, elt_add_op] + + program_config = ProgramConfig( + ops=model_net, + weights={}, + inputs={ + 'matmul_x': TensorConfig(data_gen=partial(generate_input)), + 'matmul_y': TensorConfig(data_gen=partial(generate_input)), + 'elementwise_addend': + TensorConfig(data_gen=partial(generate_input)) + }, + outputs=['elementwise_add_output']) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config( + use_mkldnn=True, passes=['matmul_elementwise_add_mkldnn_fuse_pass']) + yield config, ['matmul'], (1e-5, 1e-5) + + def test(self): + self.run_and_statis(quant=False, + passes=['matmul_elementwise_add_mkldnn_fuse_pass']) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py index a5471eca2c26e..c8fb49c10c123 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py @@ -121,8 +121,8 @@ def sample_predictor_configs(self, program_config): yield config, ["matmul"], (1e-5, 1e-5) def test(self): - self.run_and_statis(quant=False, - passes=["matmul_transpose_reshape_fuse_pass"]) + self.run_and_statis( + quant=False, passes=["matmul_transpose_reshape_mkldnn_fuse_pass"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py new file mode 100644 index 0000000000000..2858d7f2d4e33 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py @@ -0,0 +1,131 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +from functools import partial +import unittest +import hypothesis.strategies as st + + +class TestMatmulv2ActivationMkldnnFusePass(PassAutoScanTest): + + def sample_program_config(self, draw): + transpose_X = draw(st.booleans()) + transpose_Y = draw(st.booleans()) + batch_size = draw(st.integers(min_value=2, max_value=4)) + channel = draw(st.sampled_from([16, 32, 64])) + input_dim = draw(st.sampled_from([16, 32, 64])) + activation_type = draw( + st.sampled_from([ + 'relu', 'gelu', 'tanh', 'sigmoid', 'swish', 'mish', 'sqrt', + 'hard_swish', 'sigmoid', 'abs', 'relu6', 'clip', 'tanh', + 'hard_sigmoid', 'leaky_relu' + ])) + + def generate_input(type): + broadcast_X = st.booleans() + channel_X = 1 if broadcast_X else channel + channel_Y = channel if broadcast_X else 1 + batch_size_X = 1 if broadcast_X else batch_size + batch_size_Y = batch_size if broadcast_X else 1 + + if transpose_X and transpose_Y: + shape_x = [batch_size_X, channel_X, input_dim, 32] + shape_y = [batch_size_Y, channel_Y, 64, input_dim] + elif transpose_X: + shape_x = [batch_size_X, channel_X, input_dim, 32] + shape_y = [batch_size_Y, channel_Y, input_dim, 64] + elif transpose_Y: + shape_x = [batch_size_X, channel_X, 32, input_dim] + shape_y = [batch_size_Y, channel_Y, 8, input_dim] + else: + shape_x = [batch_size_X, channel_X, 32, input_dim] + shape_y = [batch_size_Y, channel_Y, input_dim, 16] + + if type == 'X': + return np.random.random(shape_x).astype(np.float32) + else: + return np.random.random(shape_y).astype(np.float32) + + matmul_op = OpConfig(type='matmul_v2', + inputs={ + 'X': ['matmul_X'], + 'Y': ['matmul_Y'] + }, + outputs={'Out': ['matmul_output']}, + attrs={ + 'trans_x': transpose_X, + 'trans_y': transpose_Y + }) + + if activation_type == 'relu6': + activation_op = OpConfig(activation_type, + inputs={'X': ['matmul_output']}, + outputs={'Out': ['activation_output']}, + threshold=draw( + st.floats(min_value=1.0, + max_value=10.0))) + elif activation_type == 
'leaky_relu': + activation_op = OpConfig(activation_type, + inputs={'X': ['matmul_output']}, + outputs={'Out': ['activation_output']}, + alpha=draw( + st.floats(min_value=0.1, + max_value=1.0))) + elif activation_type == 'swish': + activation_op = OpConfig(activation_type, + inputs={'X': ['matmul_output']}, + outputs={'Out': ['activation_output']}, + beta=draw( + st.floats(min_value=0.1, + max_value=1.0))) + elif activation_type == 'clip': + activation_op = OpConfig( + activation_type, + inputs={'X': ['matmul_output']}, + outputs={'Out': ['activation_output']}, + min=draw(st.floats(min_value=0.1, max_value=0.49)), + max=draw(st.floats(min_value=0.5, max_value=1.0))) + else: + activation_op = OpConfig(activation_type, + inputs={'X': ['matmul_output']}, + outputs={'Out': ['activation_output']}) + + model_net = [matmul_op, activation_op] + + program_config = ProgramConfig( + ops=model_net, + weights={}, + inputs={ + 'matmul_X': TensorConfig(data_gen=partial(generate_input, 'X')), + 'matmul_Y': TensorConfig(data_gen=partial(generate_input, 'Y')) + }, + outputs=['activation_output']) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ['matmul_v2'], (1e-5, 1e-5) + + def test(self): + self.run_and_statis(quant=False, + max_examples=30, + passes=['matmul_activation_mkldnn_fuse_pass']) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py new file mode 100644 index 0000000000000..03f2867948e91 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +from functools import partial +import unittest +import hypothesis.strategies as st + + +class TestMatmulV2ElementwiseAddMkldnnFusePass(PassAutoScanTest): + + def sample_program_config(self, draw): + axis = draw(st.sampled_from([-1, 0, 1])) + matmul_as_x = draw(st.booleans()) + batch_size = draw(st.integers(min_value=2, max_value=4)) + channel = draw(st.sampled_from([16, 32, 64])) + input_dim_shared = draw(st.sampled_from([16, 32, 64])) + input_dim_X = draw(st.sampled_from([16, 32, 64])) + input_dim_Y = draw(st.sampled_from([16, 32, 64])) + + def generate_input(type): + broadcast_X = st.booleans() + channel_X = 1 if broadcast_X else channel + channel_Y = channel if broadcast_X else 1 + batch_size_X = 1 if broadcast_X else batch_size + batch_size_Y = batch_size if broadcast_X else 1 + + shape_x = [batch_size_X, channel_X, input_dim_X, input_dim_shared] + shape_y = [batch_size_Y, channel_Y, input_dim_shared, input_dim_Y] + + if type == 'X': + return np.random.random(shape_x).astype(np.float32) + elif type == 'Y': + return np.random.random(shape_y).astype(np.float32) + else: + shape_out = [batch_size, channel, input_dim_X, input_dim_Y] + return np.random.random(shape_out).astype(np.float32) + + matmul_op = OpConfig(type='matmul_v2', + inputs={ + 'X': ['matmul_X'], + 'Y': ['matmul_Y'] + }, + outputs={'Out': ['matmul_output']}, + attrs={'use_mkldnn': True}) + + if matmul_as_x: + inputs = {'X': ['matmul_output'], 'Y': ['elementwise_addend']} + else: + inputs = {'X': ['elementwise_addend'], 'Y': ['matmul_output']} + + elt_add_op = OpConfig(type='elementwise_add', + inputs=inputs, + outputs={'Out': ['elementwise_add_output']}, + attrs={ + 'axis': axis, + 'use_mkldnn': True + }) + + model_net = [matmul_op, elt_add_op] + + program_config = ProgramConfig( + ops=model_net, + weights={}, + inputs={ + 'matmul_X': + TensorConfig(data_gen=partial(generate_input, 'X')), + 'matmul_Y': + TensorConfig(data_gen=partial(generate_input, 'Y')), + 'elementwise_addend': + TensorConfig(data_gen=partial(generate_input, 'ElAdd')) + }, + outputs=['elementwise_add_output']) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ['matmul_v2'], (1e-5, 1e-5) + + def test(self): + self.run_and_statis(quant=False, + max_examples=30, + passes=['matmul_elementwise_add_mkldnn_fuse_pass']) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py index 28fe916a6ef02..0e24c4a394fdc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py @@ -142,8 +142,8 @@ def sample_predictor_configs(self, program_config): yield config, [fused_op], (1e-5, 1e-5) def test(self): - self.run_and_statis(quant=False, - passes=["matmul_v2_transpose_reshape_fuse_pass"]) + self.run_and_statis( + quant=False, passes=["matmul_transpose_reshape_mkldnn_fuse_pass"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py index fb8dc034bd56b..cc699a5e27a51 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py @@ -29,7 +29,7 @@ class TestReshapeTransposeMatmulV2OneDNNFusePass(InferencePassTest): def setUp(self): self.set_params() self.tranpose_perm = [0, 2, 1, 3] - self.pass_name = 'reshape_transpose_matmul_v2_mkldnn_fuse_pass' + self.pass_name = 'reshape_transpose_matmul_mkldnn_fuse_pass' with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_rnn.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_rnn.py new file mode 100644 index 0000000000000..2a3c25bab11ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_rnn.py @@ -0,0 +1,253 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest +import os + + +class TrtConvertSliceTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + for hidden_size in [30]: + for input_size in [30]: + for batch in [2]: + for seq_len in [5]: + for num_layers in [1, 2]: + for is_bidirec in [True, False]: + dics = [] + dics.append({ + "hidden_size": hidden_size, + "input_size": input_size, + "num_layers": num_layers, + "mode": "LSTM", + "is_bidirec": is_bidirec, + "is_test": True, + "dropout_prob": 0.0, + # for convenience + "batch": batch, + "seq_len": seq_len, + }) + + K = 1 + if (dics[0]["is_bidirec"]): + K = 2 + + def generate_input1(): + return np.random.random([ + batch, seq_len, input_size + ]).astype(np.float32) * 2 - 1 + + # initial input -> hidden + def generate_w0(): + return np.random.random([ + 4 * hidden_size, input_size + ]).astype(np.float32) * 2 - 1 + + # prev layer's output -> hidden + def generate_w1(): + return np.random.random([ + 4 * hidden_size, K * hidden_size + ]).astype(np.float32) * 2 - 1 + + # hidden state -> hidden + def generate_w2(): + return np.random.random([ + 4 * hidden_size, hidden_size + ]).astype(np.float32) * 2 - 1 + + def generate_b(): + return np.random.random([ + 4 * hidden_size + ]).astype(np.float32) * 2 - 1 + + dics.append({ + "dtype": + 5, + "input_dim_idx": + 0, + "str_value": + "0.0", + "shape": [K * num_layers, -1, hidden_size], + "output_dim_idx": + 1, + }) + dics.append({"axis": [1, 0, 2]}) + # set weights + WeightList = [
"weight" + str(i) + for i in range(4 * K * + dics[0]["num_layers"]) + ] + weights = {} + for i in range((int)(len(WeightList) / 2)): + # this weight maps input -> hidden + # the input has 2 cases: the initial input (input_size), or K * hidden_size from the prev layer. + if (i % 2 == 0): + if (i <= K): + weights[ + WeightList[i]] = TensorConfig( + data_gen=partial( + generate_w0)) + else: + weights[ + WeightList[i]] = TensorConfig( + data_gen=partial( + generate_w1)) + # this weight maps hidden -> hidden + if (i % 2 == 1): + weights[WeightList[i]] = TensorConfig( + data_gen=partial(generate_w2)) + for i in range((int)(len(WeightList) / 2), + len(WeightList)): + weights[WeightList[i]] = TensorConfig( + data_gen=partial(generate_b)) + ops_config = [ + { + "op_type": + "fill_constant_batch_size_like", + "op_inputs": { + "Input": ["input_data"] + }, + "op_outputs": { + "Out": ["prestate1"] + }, + "op_attrs": dics[1] + }, + { + "op_type": + "fill_constant_batch_size_like", + "op_inputs": { + "Input": ["input_data"] + }, + "op_outputs": { + "Out": ["prestate2"] + }, + "op_attrs": dics[1] + }, + { + "op_type": "transpose2", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["rnn_input_data"] + }, + "op_attrs": dics[2] + }, + { + "op_type": "rnn", + "op_inputs": { + "Input": ["rnn_input_data"], + # prev_c, prev_h + "PreState": + ["prestate1", "prestate2"], + "WeightList": WeightList, + }, + "op_outputs": { + "Out": ["rnn_output_data"], + "State": [ + "state_output_data0", + "state_output_data1" + ], + "Reserve": ["reserve_data"], + "DropoutState": + ["DropoutState_data"] + }, + "op_attrs": dics[0] + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights=weights, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input1)) + }, + outputs=["rnn_output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + num_layers = attrs[3]["num_layers"] + hidden_size = attrs[3]["hidden_size"] + batch = attrs[3]["batch"] + input_size = attrs[3]["input_size"] + seq_len = attrs[3]["seq_len"] + + K = 1 + if attrs[3]["is_bidirec"]: + K = 2 + + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [batch - 1, seq_len, input_size], + } + self.dynamic_shape.max_input_shape = { + "input_data": [batch + 1, seq_len, input_size], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [batch, seq_len, input_size], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # The output differs between GPU and TRT in PR-CI-Windows-Inference + tol_fp32 = 1e-5 + tol_half = 1e-2 + if (os.name == 'nt'): + tol_fp32 = 1e-2 + tol_half = 1e-1 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), tol_fp32 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), tol_half + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py index 4354883a44274..3ac6e23d21a4d 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py @@ -303,8 +303,8 @@ def _test(self, run_mlu=True): def test_mlu(self): mlu_pred, mlu_loss = self._test(True) cpu_pred, cpu_loss = self._test(False) - self.assertTrue(np.allclose(mlu_pred, cpu_pred, rtol=1e-3)) - self.assertTrue(np.allclose(mlu_loss, cpu_loss, rtol=1e-3)) + np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-3) + np.testing.assert_allclose(mlu_loss, cpu_loss, rtol=1e-3) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py index 5c69cdb74093a..e60664f27a0e1 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py @@ -249,8 +249,8 @@ def _test(self, run_mlu=True): def test_mlu(self): mlu_pred, mlu_loss = self._test(True) cpu_pred, cpu_loss = self._test(False) - self.assertTrue(np.allclose(mlu_pred, cpu_pred, rtol=1e-3)) - self.assertTrue(np.allclose(mlu_loss, cpu_loss, rtol=1e-3)) + np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-3) + np.testing.assert_allclose(mlu_loss, cpu_loss, rtol=1e-3) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py index 86f044b9d3dad..0f50da6b4f059 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu.py @@ -219,7 +219,10 @@ def setUp(self): self.init_kernel_type() def __assert_close(self, tensor, np_array, msg, atol=1e-4): - self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + np.testing.assert_allclose(np.array(tensor), + np_array, + atol=atol, + err_msg=msg) def check_with_place(self, place, data_layout, dtype, shape): epsilon = 0.00001 @@ -672,7 +675,7 @@ def compute(x, is_test, trainable_statistics): x = np.random.randn(*shape).astype("float32") y1 = compute(x, False, False) y2 = compute(x, True, True) - self.assertTrue(np.allclose(y1, y2)) + np.testing.assert_allclose(y1, y2) def test_static(self): places = [fluid.CPUPlace()] @@ -697,7 +700,7 @@ def compute(x_np, is_test, trainable_statistics): x = np.random.randn(*shape).astype("float32") y1 = compute(x, False, False) y2 = compute(x, True, True) - self.assertTrue(np.allclose(y1, y2)) + np.testing.assert_allclose(y1, y2) class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py index b0fec2bdd0f6a..b4f58a7c5f018 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_batch_norm_op_mlu_v2.py @@ -137,8 +137,8 @@ def compute_v4(x): y2 = compute_v2(x) y3 = compute_v3(x, False, False) y4 = compute_v4(x) - self.assertTrue(np.allclose(y1, y2)) - self.assertTrue(np.allclose(y3, y4)) + np.testing.assert_allclose(y1, y2) + np.testing.assert_allclose(y3, y4) def test_static(self): places = [fluid.CPUPlace()] @@ -172,7 +172,7 @@ def compute_v2(x_np): x = np.random.randn(*shape).astype("float32") y1 = compute_v1(x, False, False) y2 = compute_v2(x) 
- self.assertTrue(np.allclose(y1, y2)) + np.testing.assert_allclose(y1, y2) class TestBatchNormChannelLast(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_bce_loss_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_bce_loss_mlu.py index 78dd988aa7ef8..3805d27a14f04 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_bce_loss_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_bce_loss_mlu.py @@ -58,7 +58,7 @@ def test_static_layer(place, "weight": weight_np }, fetch_list=[res]) - return static_result + return static_result[0] def test_static_functional(place, @@ -98,7 +98,7 @@ def test_static_functional(place, "weight": weight_np }, fetch_list=[res]) - return static_result + return static_result[0] def test_dygraph_layer(place, @@ -174,16 +174,18 @@ def test_BCELoss(self): dy_result = test_dygraph_layer(place, input_np, label_np, reduction) expected = calc_bceloss(input_np, label_np, reduction) - self.assertTrue(np.allclose(static_result, expected)) - self.assertTrue(np.allclose(static_result, dy_result)) - self.assertTrue(np.allclose(dy_result, expected)) + np.testing.assert_allclose(static_result, expected, rtol=1e-6) + np.testing.assert_allclose(static_result, dy_result) + np.testing.assert_allclose(dy_result, expected, rtol=1e-6) static_functional = test_static_functional( place, input_np, label_np, reduction) dy_functional = test_dygraph_functional(place, input_np, label_np, reduction) - self.assertTrue(np.allclose(static_functional, expected)) - self.assertTrue(np.allclose(static_functional, dy_functional)) - self.assertTrue(np.allclose(dy_functional, expected)) + np.testing.assert_allclose(static_functional, + expected, + rtol=1e-6) + np.testing.assert_allclose(static_functional, dy_functional) + np.testing.assert_allclose(dy_functional, expected, rtol=1e-6) def test_BCELoss_weight(self): input_np = np.random.uniform(0.1, 0.8, @@ -207,9 +209,9 @@ def test_BCELoss_weight(self): label_np, reduction, weight_np=weight_np) - self.assertTrue(np.allclose(static_result, expected)) - self.assertTrue(np.allclose(static_result, dy_result)) - self.assertTrue(np.allclose(dy_result, expected)) + np.testing.assert_allclose(static_result, expected, rtol=1e-6) + np.testing.assert_allclose(static_result, dy_result) + np.testing.assert_allclose(dy_result, expected, rtol=1e-6) static_functional = test_static_functional(place, input_np, label_np, @@ -220,9 +222,9 @@ def test_BCELoss_weight(self): label_np, reduction, weight_np=weight_np) - self.assertTrue(np.allclose(static_functional, expected)) - self.assertTrue(np.allclose(static_functional, dy_functional)) - self.assertTrue(np.allclose(dy_functional, expected)) + np.testing.assert_allclose(static_functional, expected, rtol=1e-6) + np.testing.assert_allclose(static_functional, dy_functional) + np.testing.assert_allclose(dy_functional, expected, rtol=1e-6) def test_BCELoss_error(self): paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_bce_with_logits_loss_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_bce_with_logits_loss_mlu.py index 42989a5c44b10..6b0b91cce9383 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_bce_with_logits_loss_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_bce_with_logits_loss_mlu.py @@ -61,7 +61,7 @@ def test_static(place, res = call_bce_layer(logit, label, weight, reduction, pos_weight) exe = paddle.static.Executor(place) static_result = exe.run(prog, feed=feed_dict, fetch_list=[res]) - return static_result + return 
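The BCE-loss helpers above now return static_result[0] instead of the whole fetch result. Executor.run(..., fetch_list=[res]) returns a Python list with one numpy array per fetched variable; np.allclose silently wraps and broadcasts that list, whereas np.testing.assert_allclose compares shapes strictly, so the helpers hand back the single fetched array. A small sketch with made-up values, not taken from the patch:

    import numpy as np

    # Stand-in for what exe.run(prog, ..., fetch_list=[res]) returns:
    # a list holding one array per fetch target.
    static_result = [np.array([0.25, 0.50, 0.75])]
    expected = np.array([0.25, 0.50, 0.75])

    # np.allclose wraps the list into shape (1, 3) and broadcasts, so this passes.
    assert np.allclose(static_result, expected)

    # assert_allclose rejects mismatched shapes, so compare the fetched array itself.
    np.testing.assert_allclose(static_result[0], expected)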
static_result[0] paddle.enable_static() @@ -86,9 +86,9 @@ def test_BCEWithLogitsLoss(self): reduction=reduction) expected = calc_bce_with_logits_loss(logit_np, label_np, reduction) - self.assertTrue(np.allclose(static_result, expected)) - self.assertTrue(np.allclose(static_result, dy_result)) - self.assertTrue(np.allclose(dy_result, expected)) + np.testing.assert_allclose(static_result, expected, rtol=1e-6) + np.testing.assert_allclose(static_result, dy_result) + np.testing.assert_allclose(dy_result, expected, rtol=1e-6) static_functional = test_static(place, logit_np, label_np, @@ -100,9 +100,11 @@ def test_BCEWithLogitsLoss(self): reduction=reduction, functional=True) - self.assertTrue(np.allclose(static_functional, expected)) - self.assertTrue(np.allclose(static_functional, dy_functional)) - self.assertTrue(np.allclose(dy_functional, expected)) + np.testing.assert_allclose(static_functional, + expected, + rtol=1e-6) + np.testing.assert_allclose(static_functional, dy_functional) + np.testing.assert_allclose(dy_functional, expected, rtol=1e-6) def test_BCEWithLogitsLoss_weight(self): logit_np = np.random.uniform(0.1, 0.8, @@ -126,9 +128,9 @@ def test_BCEWithLogitsLoss_weight(self): label_np, reduction, weight_np=weight_np) - self.assertTrue(np.allclose(static_result, expected)) - self.assertTrue(np.allclose(static_result, dy_result)) - self.assertTrue(np.allclose(dy_result, expected)) + np.testing.assert_allclose(static_result, expected, rtol=1e-6) + np.testing.assert_allclose(static_result, dy_result) + np.testing.assert_allclose(dy_result, expected, rtol=1e-6) static_functional = test_static(place, logit_np, label_np, @@ -141,9 +143,9 @@ def test_BCEWithLogitsLoss_weight(self): weight_np=weight_np, reduction=reduction, functional=True) - self.assertTrue(np.allclose(static_functional, expected)) - self.assertTrue(np.allclose(static_functional, dy_functional)) - self.assertTrue(np.allclose(dy_functional, expected)) + np.testing.assert_allclose(static_functional, expected, rtol=1e-6) + np.testing.assert_allclose(static_functional, dy_functional) + np.testing.assert_allclose(dy_functional, expected, rtol=1e-6) def test_BCEWithLogitsLoss_pos_weight(self): logit_np = np.random.uniform(0.1, 0.8, @@ -160,9 +162,9 @@ def test_BCEWithLogitsLoss_pos_weight(self): reduction, pos_weight_np) expected = calc_bce_with_logits_loss(logit_np, label_np, reduction, weight_np, pos_weight_np) - self.assertTrue(np.allclose(static_result, expected)) - self.assertTrue(np.allclose(static_result, dy_result)) - self.assertTrue(np.allclose(dy_result, expected)) + np.testing.assert_allclose(static_result, expected) + np.testing.assert_allclose(static_result, dy_result) + np.testing.assert_allclose(dy_result, expected) static_functional = test_static(place, logit_np, label_np, @@ -177,9 +179,9 @@ def test_BCEWithLogitsLoss_pos_weight(self): reduction, pos_weight_np, functional=True) - self.assertTrue(np.allclose(static_functional, expected)) - self.assertTrue(np.allclose(static_functional, dy_functional)) - self.assertTrue(np.allclose(dy_functional, expected)) + np.testing.assert_allclose(static_functional, expected) + np.testing.assert_allclose(static_functional, dy_functional) + np.testing.assert_allclose(dy_functional, expected) def test_BCEWithLogitsLoss_error(self): paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py index d7e53639490d2..ece4c2f3304ab 100644 --- 
a/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py @@ -563,7 +563,7 @@ def test_case(self): out_w=12, align_corners=True) for res in results: - self.assertTrue(np.allclose(res, expect_res)) + np.testing.assert_allclose(res, expect_res, rtol=1e-6) class TestBilinearInterpOpAPI_dy(unittest.TestCase): @@ -585,7 +585,7 @@ def test_case(self): size=[12, 12], mode="bilinear", align_corners=False) - self.assertTrue(np.allclose(out.numpy(), expect_res)) + np.testing.assert_allclose(out.numpy(), expect_res, rtol=1e-6) class TestBilinearInterpOpAPI_dy2(unittest.TestCase): @@ -609,7 +609,7 @@ def test_case(self): size=size, mode="bilinear", align_corners=False) - self.assertTrue(np.allclose(out.numpy(), expect_res)) + np.testing.assert_allclose(out.numpy(), expect_res, rtol=1e-6) class TestBilinearInterpOpAPI_dy3(unittest.TestCase): @@ -633,7 +633,7 @@ def test_case(self): size=[size, size], mode="bilinear", align_corners=False) - self.assertTrue(np.allclose(out.numpy(), expect_res)) + np.testing.assert_allclose(out.numpy(), expect_res, rtol=1e-6) class TestBilinearInterpOpAPI_dy4(unittest.TestCase): @@ -658,7 +658,7 @@ def test_case(self): mode="bilinear", align_corners=False) - self.assertTrue(np.allclose(out.numpy(), expect_res)) + np.testing.assert_allclose(out.numpy(), expect_res, rtol=1e-6) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py index 04332b061f885..1b3ce96111573 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py @@ -210,22 +210,26 @@ def check_with_place(self, input2 = np.random.random((10, 1000)).astype(np_data_type) if col_type == "broadcast": need_result = input2 - self.assertTrue(np.allclose(tr0_out, need_result)) - self.assertTrue(np.allclose(tr1_out, need_result)) + np.testing.assert_allclose(tr0_out, need_result) + np.testing.assert_allclose(tr1_out, need_result) elif col_type == "allreduce": need_result = input1 + input2 - self.assertTrue( - np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05)) - self.assertTrue( - np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05)) + np.testing.assert_allclose(tr0_out, + need_result, + rtol=1e-05, + atol=1e-05) + np.testing.assert_allclose(tr1_out, + need_result, + rtol=1e-05, + atol=1e-05) elif col_type == "reduce": need_result = input1 + input2 - self.assertTrue(np.allclose(tr0_out, need_result)) + np.testing.assert_allclose(tr0_out, need_result) elif col_type == "allgather": need_result = np.vstack((input1, input2)) tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) tr_out1 = np.vstack((tr1_out[0], tr1_out[1])) - self.assertTrue(np.allclose(tr_out0, need_result)) - self.assertTrue(np.allclose(tr_out1, need_result)) + np.testing.assert_allclose(tr_out0, need_result) + np.testing.assert_allclose(tr_out1, need_result) else: pass diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py index 4ec1e7f7528bb..47fb3a1a2305c 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py @@ -259,47 +259,63 @@ def check_with_place(self, input2 = np.random.random((10, 1000)).astype(np_data_type) if col_type == 
"broadcast": need_result = input2 - self.assertTrue(np.allclose(tr0_out, need_result)) - self.assertTrue(np.allclose(tr1_out, need_result)) + np.testing.assert_allclose(tr0_out, need_result) + np.testing.assert_allclose(tr1_out, need_result) elif col_type == "allreduce_sum": need_result = input1 + input2 - self.assertTrue( - np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05)) - self.assertTrue( - np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05)) + np.testing.assert_allclose(tr0_out, + need_result, + rtol=1e-05, + atol=1e-05) + np.testing.assert_allclose(tr1_out, + need_result, + rtol=1e-05, + atol=1e-05) elif col_type == "allreduce_prod": need_result = input1 * input2 - self.assertTrue( - np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05)) - self.assertTrue( - np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05)) + np.testing.assert_allclose(tr0_out, + need_result, + rtol=1e-05, + atol=1e-05) + np.testing.assert_allclose(tr1_out, + need_result, + rtol=1e-05, + atol=1e-05) elif col_type == "allreduce_max": need_result = np.maximum(input1, input2) - self.assertTrue( - np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05)) - self.assertTrue( - np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05)) + np.testing.assert_allclose(tr0_out, + need_result, + rtol=1e-05, + atol=1e-05) + np.testing.assert_allclose(tr1_out, + need_result, + rtol=1e-05, + atol=1e-05) elif col_type == "allreduce_min": need_result = np.minimum(input1, input2) - self.assertTrue( - np.allclose(tr0_out, need_result, rtol=1e-05, atol=1e-05)) - self.assertTrue( - np.allclose(tr1_out, need_result, rtol=1e-05, atol=1e-05)) + np.testing.assert_allclose(tr0_out, + need_result, + rtol=1e-05, + atol=1e-05) + np.testing.assert_allclose(tr1_out, + need_result, + rtol=1e-05, + atol=1e-05) elif col_type == "reduce_sum": need_result = input1 + input2 - self.assertTrue(np.allclose(tr1_out, need_result)) + np.testing.assert_allclose(tr1_out, need_result) elif col_type == "reduce_prod": need_result = input1 * input2 - self.assertTrue(np.allclose(tr1_out, need_result)) + np.testing.assert_allclose(tr1_out, need_result) elif col_type == "reduce_max": need_result = np.maximum(input1, input2) - self.assertTrue(np.allclose(tr1_out, need_result)) + np.testing.assert_allclose(tr1_out, need_result) elif col_type == "reduce_min": need_result = np.minimum(input1, input2) - self.assertTrue(np.allclose(tr1_out, need_result)) + np.testing.assert_allclose(tr1_out, need_result) elif col_type == "allgather": need_result = np.vstack((input1, input2)) - self.assertTrue(np.allclose(tr0_out, need_result)) - self.assertTrue(np.allclose(tr1_out, need_result)) + np.testing.assert_allclose(tr0_out, need_result) + np.testing.assert_allclose(tr1_out, need_result) else: pass diff --git a/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py index e9d172c89410e..8497853561d87 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py @@ -268,11 +268,11 @@ def check_static_result(self, place): fetches = exe.run(fluid.default_main_program(), feed={"input": in_np}, fetch_list=[res]) - self.assertTrue(np.allclose(fetches[0], res_np)) + np.testing.assert_allclose(fetches[0], res_np) fetches2 = exe.run(fluid.default_main_program(), feed={"input": in_np}, fetch_list=[res6]) - self.assertTrue(np.allclose(fetches2[0], res_np2)) + np.testing.assert_allclose(fetches2[0], res_np2) def 
test_static(self): for place in self.places: diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_max_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_max_op_mlu.py index 6ecbf51c28af8..a1be1152d16fe 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_max_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_max_op_mlu.py @@ -361,8 +361,8 @@ def test_mlu(self): cpu_pred, cpu_loss = self._test(False) mlu_pred, mlu_loss = self._test(True) - self.assertTrue(np.allclose(mlu_pred, cpu_pred)) - self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(mlu_loss, cpu_loss) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_min_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_min_op_mlu.py index f04f0eb781e5d..20bd124e4ab0a 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_min_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_min_op_mlu.py @@ -222,8 +222,8 @@ def test_mlu(self): cpu_pred, cpu_loss = self._test(False) mlu_pred, mlu_loss = self._test(True) - self.assertTrue(np.allclose(mlu_pred, cpu_pred)) - self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(mlu_loss, cpu_loss) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py index cbc99c2fa6686..19aad53f1d3aa 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py @@ -302,7 +302,7 @@ def test_expand_times_is_tensor(self): expand_1 = paddle.expand(a, shape=[2, 5]) np_array = np.array([2, 5]) expand_2 = paddle.expand(a, shape=np_array) - self.assertTrue(np.array_equal(expand_1.numpy(), expand_2.numpy())) + np.testing.assert_allclose(expand_1.numpy(), expand_2.numpy()) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py index 604dbf4ddbcce..d78c9405e3671 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_op_mlu.py @@ -122,7 +122,7 @@ def check_with_place(self, place): result_array = np.array(out.get_tensor()) full_array = np.full((123, 92), 3.8, 'float32') - self.assertTrue(np.array_equal(result_array, full_array)) + np.testing.assert_allclose(result_array, full_array) def test_fill_constant_with_selected_rows(self): places = [core.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py index deee1a38b3101..47d74e97f9a16 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py @@ -292,7 +292,7 @@ def test_imperative(self): output = paddle.fluid.layers.gather(input, index) output_np = output.numpy() expected_output = np.array([3, 4]) - self.assertTrue(np.allclose(output_np, expected_output)) + np.testing.assert_allclose(output_np[0], expected_output, rtol=1e-6) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py 
b/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py index 6c6ddda303d4e..bc59b3d0faffd 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py @@ -101,7 +101,7 @@ def test_out1(self): output = paddle.fluid.layers.gather(input, index) output_np = output.numpy() expected_output = np.array([[3, 4], [5, 6]]).astype('int32') - self.assertTrue(np.allclose(output_np, expected_output)) + np.testing.assert_allclose(output_np, expected_output) paddle.enable_static() def test_out12(self): @@ -113,7 +113,7 @@ def test_out12(self): output = paddle.gather(x, index, axis=0) output_np = output.numpy() expected_output = gather_numpy(input_1, index_1, axis=0) - self.assertTrue(np.allclose(output_np, expected_output)) + np.testing.assert_allclose(output_np, expected_output) paddle.enable_static() def test_zero_index(self): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gaussian_random_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gaussian_random_op_mlu.py index 9f755de687234..acc711ffdbd9a 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_gaussian_random_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_gaussian_random_op_mlu.py @@ -65,8 +65,12 @@ def verify_output(self, outs): hist2, _ = np.histogram(data, range=(-3, 5)) hist2 = hist2.astype("float32") hist2 /= float(outs[0].size) - self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.01), - "hist: " + str(hist) + " hist2: " + str(hist2)) + np.testing.assert_allclose(hist, + hist2, + rtol=0, + atol=0.01, + err_msg="hist: " + str(hist) + " hist2: " + + str(hist2)) class TestMeanStdAreInt(TestGaussianRandomOp): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py index 2cf89789bfc8b..4d6dc9c8b5d93 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py @@ -150,8 +150,8 @@ def test_mlu(self): cpu_pred, cpu_loss = self._test(False) mlu_pred, mlu_loss = self._test(True) - self.assertTrue(np.allclose(mlu_pred, cpu_pred, atol=1e-3)) - self.assertTrue(np.allclose(mlu_loss, cpu_loss, atol=1e-3)) + np.testing.assert_allclose(mlu_pred, cpu_pred, atol=1e-3) + np.testing.assert_allclose(mlu_loss, cpu_loss, atol=1e-3) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py index 5050e2006f333..5fd9ea9fcdc74 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py @@ -147,7 +147,7 @@ def test_static_api(self): res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) out_ref = ref_hardsigmoid(self.x_np) for r in res: - self.assertTrue(np.allclose(out_ref, r)) + np.testing.assert_allclose(out_ref, r, rtol=1e-6) def test_dygraph_api(self): paddle.disable_static(self.place) @@ -157,22 +157,23 @@ def test_dygraph_api(self): out2 = m(x) out_ref = ref_hardsigmoid(self.x_np) for r in [out1, out2]: - self.assertTrue(np.allclose(out_ref, r.numpy())) + np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-6) paddle.enable_static() def test_fluid_api(self): + paddle.enable_static() with fluid.program_guard(fluid.Program()): x = fluid.data('X', self.x_np.shape, self.x_np.dtype) out = fluid.layers.hard_sigmoid(x) exe = fluid.Executor(self.place) res = 
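The gaussian_random check above keeps its custom failure message by passing it as err_msg: assertTrue's optional second argument becomes assert_allclose's err_msg keyword, and rtol=0 turns the check into a purely absolute one. A minimal sketch with made-up histograms, not from the patch:

    import numpy as np

    hist = np.array([0.099, 0.101, 0.100])
    hist2 = np.array([0.100, 0.100, 0.100])

    # rtol=0 makes this an absolute comparison; err_msg is appended to the
    # failure report instead of being assertTrue's second positional argument.
    np.testing.assert_allclose(hist, hist2, rtol=0, atol=0.01,
                               err_msg="hist: " + str(hist) + " hist2: " + str(hist2))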
exe.run(feed={'X': self.x_np}, fetch_list=[out]) out_ref = ref_hardsigmoid(self.x_np, 0.2, 0.5) - self.assertTrue(np.allclose(out_ref, res[0])) + np.testing.assert_allclose(out_ref, res[0]) paddle.disable_static(self.place) x = paddle.to_tensor(self.x_np) out = paddle.fluid.layers.hard_sigmoid(x) - self.assertTrue(np.allclose(out_ref, out.numpy())) + np.testing.assert_allclose(out_ref, out.numpy()) paddle.enable_static() def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py index 5df59be28a87b..e5514285ba1f0 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py @@ -45,7 +45,11 @@ def setUp(self): self.__class__.use_mlu = True def __assert_close(self, tensor, np_array, msg, atol=1e-4): - self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + np.testing.assert_allclose(np.array(tensor), + np_array, + rtol=1e-5, + atol=atol, + err_msg=msg) def check_forward_backward(self, shape, @@ -152,11 +156,11 @@ def test_with_place(place, 1e-3) self.__assert_close(x_grad, out[3], "x_grad") if has_scale: - self.__assert_close(scale_grad, + self.__assert_close(scale_grad.reshape(-1), out[fetch_list.index('scale@GRAD')], "scale_grad", 1e-3) if has_bias: - self.__assert_close(bias_grad, + self.__assert_close(bias_grad.reshape(-1), out[fetch_list.index('bias@GRAD')], "bias_grad") @@ -287,7 +291,7 @@ def test_main(self): x_np, weight_np, bias_np, 'float32') def assert_equal(x, y): - self.assertTrue(np.array_equal(x, y)) + np.testing.assert_allclose(x, y) assert_equal(y_np_1, y_np_2) assert_equal(x_g_np_1, x_g_np_2) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py index 0aad79eb61f92..edf5d2bb28410 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_leaky_relu_op_mlu.py @@ -145,8 +145,8 @@ def test_mlu(self): cpu_pred, cpu_loss = self._test(False) mlu_pred, mlu_loss = self._test(True) - self.assertTrue(np.allclose(mlu_pred, cpu_pred)) - self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(mlu_loss, cpu_loss) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_log_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_log_op_mlu.py index 82aeb577205d5..6f068c341aab7 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_log_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_log_op_mlu.py @@ -123,7 +123,7 @@ def test_api(self): feed={"data_x": input_x}, fetch_list=[out1]) expected_res = np.log2(input_x) - self.assertTrue(np.allclose(res1, expected_res)) + np.testing.assert_allclose(res1[0], expected_res, rtol=1e-6) # dygraph with fluid.dygraph.guard(): @@ -134,7 +134,7 @@ def test_api(self): z_expected = np.array(np.log2(np_x)) np.savetxt("np_z.txt", np_z.flatten(), fmt="%.4f") np.savetxt("z_expected.txt", z_expected.flatten(), fmt="%.4f") - self.assertTrue(np.allclose(np_z, z_expected, atol=1e-6)) + np.testing.assert_allclose(np_z, z_expected, atol=1e-6) class TestLog10(TestActivation): @@ -173,7 +173,7 @@ def test_api(self): feed={"data_x": input_x}, fetch_list=[out1]) expected_res = np.log10(input_x) - self.assertTrue(np.allclose(res1, expected_res)) + 
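Several hunks above (hard_sigmoid, layer_norm, log2/log10) add an explicit rtol where the old np.allclose call had none. The two functions have different defaults: np.allclose uses rtol=1e-05 and atol=1e-08, while np.testing.assert_allclose uses rtol=1e-07 and atol=0, so a bare conversion would tighten the comparison and the added rtol roughly preserves the old tolerance. A sketch with made-up float32 values, not from the patch:

    import numpy as np

    a = np.float32(1.0)
    b = np.float32(1.0 + 3e-6)

    assert np.allclose(a, b)                     # passes under the default rtol=1e-05
    np.testing.assert_allclose(a, b, rtol=1e-5)  # passes once the old tolerance is restored
    try:
        np.testing.assert_allclose(a, b)         # default rtol=1e-07 is stricter and fails
    except AssertionError:
        pass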
np.testing.assert_allclose(res1[0], expected_res, rtol=1e-6) # dygraph with fluid.dygraph.guard(): @@ -182,7 +182,7 @@ def test_api(self): z = paddle.log10(data_x) np_z = z.numpy() z_expected = np.array(np.log10(np_x)) - self.assertTrue(np.allclose(np_z, z_expected)) + np.testing.assert_allclose(np_z, z_expected, rtol=1e-4) class TestLogHalf(TestLog): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py index 1b81455f47797..91e8a86ce6856 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py @@ -136,19 +136,20 @@ def check_api(self, axis=-1): ref_out = np.apply_along_axis(ref_log_softmax, axis, self.x) logsoftmax = paddle.nn.LogSoftmax(axis) + paddle.enable_static() # test static api with paddle.static.program_guard(paddle.static.Program()): x = paddle.fluid.data(name='x', shape=self.x_shape) y = logsoftmax(x) exe = paddle.static.Executor(self.place) out = exe.run(feed={'x': self.x}, fetch_list=[y]) - self.assertTrue(np.allclose(out[0], ref_out)) + np.testing.assert_allclose(out[0], ref_out, rtol=1e-6) # test dygrapg api paddle.disable_static() x = paddle.to_tensor(self.x) y = logsoftmax(x) - self.assertTrue(np.allclose(y.numpy(), ref_out)) + np.testing.assert_allclose(y.numpy(), ref_out, rtol=1e-6) paddle.enable_static() def test_check_api(self): @@ -177,12 +178,12 @@ def check_api(self, axis=-1, dtype=None): y = F.log_softmax(x, axis, dtype) exe = paddle.static.Executor(self.place) out = exe.run(feed={'x': self.x}, fetch_list=[y]) - self.assertTrue(np.allclose(out[0], ref_out)) + np.testing.assert_allclose(out[0], ref_out, rtol=1e-6) paddle.disable_static() x = paddle.to_tensor(self.x) y = F.log_softmax(x, axis, dtype) - self.assertTrue(np.allclose(y.numpy(), ref_out), True) + np.testing.assert_allclose(y.numpy(), ref_out, rtol=1e-6) paddle.enable_static() def test_check_api(self): @@ -191,12 +192,14 @@ def test_check_api(self): self.check_api(-1, 'float32') def test_errors(self): + paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): x = paddle.fluid.data(name='X1', shape=[100], dtype='int32') self.assertRaises(TypeError, F.log_softmax, x) x = paddle.fluid.data(name='X2', shape=[100], dtype='float32') self.assertRaises(TypeError, F.log_softmax, x, dtype='int32') + paddle.disable_static() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py index 31eb98b7a8850..7a1f590cf3da6 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py @@ -316,7 +316,7 @@ def run_op(use_merged): outs2 = run_op(False) self.assertEqual(len(outs1), len(outs2)) for i, (out1, out2) in enumerate(zip(outs1, outs2)): - self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + np.testing.assert_allclose(out1, out2, atol=1e-7) def test_main(self): self.check_with_place(self.place, multi_precision=False) @@ -370,13 +370,13 @@ def run_op(use_nesterov, use_merged): outs2 = run_op(use_nesterov=True, use_merged=False) self.assertEqual(len(outs1), len(outs2)) for i, (out1, out2) in enumerate(zip(outs1, outs2)): - self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + np.testing.assert_allclose(out1, out2, atol=1e-7) outs3 = run_op(use_nesterov=False, use_merged=True) outs4 = 
run_op(use_nesterov=False, use_merged=False) self.assertEqual(len(outs3), len(outs4)) for j, (out3, out4) in enumerate(zip(outs3, outs4)): - self.assertTrue(np.allclose(out3, out4, atol=1e-7)) + np.testing.assert_allclose(out3, out4, atol=1e-7) def test_main(self): self.check_with_place(self.place, multi_precision=False) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py index 59078a21d0fa8..60364818439f4 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py @@ -594,10 +594,10 @@ def test_case(self): out_h=12, out_w=12, align_corners=False) - self.assertTrue( - np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1)))) + np.testing.assert_allclose(results[0], + np.transpose(expect_res, (0, 2, 3, 1))) for i in range(len(results) - 1): - self.assertTrue(np.allclose(results[i + 1], expect_res)) + np.testing.assert_allclose(results[i + 1], expect_res) class TestNearestInterpException(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py index d33646cbfa32b..c3715342e7377 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py @@ -993,7 +993,7 @@ def test_nhwc(self): paddings=[0, 0], pool_type='max', data_format='NHWC') - self.assertTrue(np.allclose(out1.numpy(), out2)) + np.testing.assert_allclose(out1.numpy(), out2) def test_lower_case(self): with fluid.dygraph.guard(): @@ -1010,7 +1010,7 @@ def test_lower_case(self): paddings=[0, 0], pool_type='max', data_format='NHWC') - self.assertTrue(np.allclose(out1.numpy(), out2)) + np.testing.assert_allclose(out1.numpy(), out2) def test_upper_case(self): with fluid.dygraph.guard(): @@ -1027,7 +1027,7 @@ def test_upper_case(self): paddings=[0, 0], pool_type='max', data_format='NHWC') - self.assertTrue(np.allclose(out1.numpy(), out2)) + np.testing.assert_allclose(out1.numpy(), out2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_randperm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_randperm_op_mlu.py index 445dc449236b3..5412e6c4a7b60 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_randperm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_randperm_op_mlu.py @@ -160,61 +160,61 @@ def test_fixed_random_number(self): expect = [ 24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413 ] - self.assertTrue(np.array_equal(x[0:10], expect)) + np.testing.assert_allclose(x[0:10], expect) expect = [ 29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096 ] - self.assertTrue(np.array_equal(x[10000:10010], expect)) + np.testing.assert_allclose(x[10000:10010], expect) expect = [ 298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343 ] - self.assertTrue(np.array_equal(x[20000:20010], expect)) + np.testing.assert_allclose(x[20000:20010], expect) x = paddle.randperm(30000, dtype='int64').numpy() expect = [ 6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171 ] - self.assertTrue(np.array_equal(x[0:10], expect)) + np.testing.assert_allclose(x[0:10], expect) expect = [ 23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818 ] - self.assertTrue(np.array_equal(x[10000:10010], expect)) + np.testing.assert_allclose(x[10000:10010], expect) expect = 
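The nearest_interp hunk above compares the NHWC result against np.transpose(expect_res, (0, 2, 3, 1)). A small sketch of what that axis permutation does, with assumed shapes rather than the test's real data: it reorders an (N, C, H, W) array into (N, H, W, C).

    import numpy as np

    expect_res = np.zeros((2, 3, 12, 12))            # assumed N, C, H, W layout
    nhwc = np.transpose(expect_res, (0, 2, 3, 1))    # becomes N, H, W, C
    assert nhwc.shape == (2, 12, 12, 3)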
[5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839] - self.assertTrue(np.array_equal(x[20000:20010], expect)) + np.testing.assert_allclose(x[20000:20010], expect) x = paddle.randperm(30000, dtype='float32').numpy() expect = [ 5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144., 22906., 10705. ] - self.assertTrue(np.array_equal(x[0:10], expect)) + np.testing.assert_allclose(x[0:10], expect) expect = [ 1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564., 26991. ] - self.assertTrue(np.array_equal(x[10000:10010], expect)) + np.testing.assert_allclose(x[10000:10010], expect) expect = [ 25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339., 4662. ] - self.assertTrue(np.array_equal(x[20000:20010], expect)) + np.testing.assert_allclose(x[20000:20010], expect) x = paddle.randperm(30000, dtype='float64').numpy() expect = [ 19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147., 9163. ] - self.assertTrue(np.array_equal(x[0:10], expect)) + np.testing.assert_allclose(x[0:10], expect) expect = [ 15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945., 17624. ] - self.assertTrue(np.array_equal(x[10000:10010], expect)) + np.testing.assert_allclose(x[10000:10010], expect) expect = [ 10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202., 21404. ] - self.assertTrue(np.array_equal(x[20000:20010], expect)) + np.testing.assert_allclose(x[20000:20010], expect) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py index a6bb42878a684..6c2b35912dc85 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py @@ -163,8 +163,8 @@ def test_mlu(self): cpu_pred, cpu_loss = self._test(False) mlu_pred, mlu_loss = self._test(True) - self.assertTrue(np.allclose(mlu_pred, cpu_pred)) - self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(mlu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py index 495711e5303f3..53b3903e951d1 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py @@ -165,8 +165,8 @@ def test_mlu(self): cpu_pred, cpu_loss = self._test(False) mlu_pred, mlu_loss = self._test(True) - self.assertTrue(np.allclose(mlu_pred, cpu_pred)) - self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(mlu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_scale_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_scale_op_mlu.py index aed58a352f4dc..b7ec554697615 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_scale_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_scale_op_mlu.py @@ -185,7 +185,7 @@ def test_api(self): exe = paddle.static.Executor(place=paddle.CPUPlace()) out = exe.run(main_prog, feed={"x": input}, fetch_list=[out]) - self.assertEqual(np.array_equal(out[0], input * 2.0 + 3.0), True) + np.testing.assert_array_equal(out[0], input * 2.0 + 3.0) class TestScaleInplaceApiStatic(TestScaleApiStatic): @@ -204,7 +204,7 @@ def test_api(self): input = np.random.random([2, 
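The scale-op hunks above switch np.array_equal checks to np.testing.assert_array_equal, while the randperm expectations are compared with assert_allclose. assert_array_equal is the exact, element-wise counterpart of np.array_equal and also reports the mismatching positions on failure; for integer outputs such as the permutation values above, assert_allclose with its default tolerances behaves as an exact check in practice. A sketch reusing a few of the expected values, otherwise made up:

    import numpy as np

    out = np.array([24562, 8409, 9379], dtype=np.int64)
    expect = [24562, 8409, 9379]

    # Exact, element-wise comparison with a detailed failure report.
    np.testing.assert_array_equal(out, expect)

    # For exact integer data of this size the tolerance-based form is effectively exact too.
    np.testing.assert_allclose(out, expect)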
25]).astype("float32") x = paddle.to_tensor(input) out = self._executed_api(x, scale=2.0, bias=3.0) - self.assertEqual(np.array_equal(out.numpy(), input * 2.0 + 3.0), True) + np.testing.assert_array_equal(out.numpy(), input * 2.0 + 3.0) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py index d901813e3482a..cd0b14e0c800f 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py @@ -212,7 +212,7 @@ def test_static_graph(): mlu_value = mlu_exe.run(feed=feed, fetch_list=fetch)[0] return mlu_value - self.assertTrue(np.array_equal(test_dygraph(), test_static_graph())) + np.testing.assert_allclose(test_dygraph(), test_static_graph()) class TestScatterOpFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py index a074a9d91a8bc..71116b4d3cebb 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py @@ -549,7 +549,7 @@ def test_starts_ends_is_tensor(self): ends=paddle.to_tensor(ends, dtype='int32')) a_2 = paddle.slice(a, axes=axes, starts=starts, ends=ends) - self.assertTrue(np.array_equal(a_1.numpy(), a_2.numpy())) + np.testing.assert_allclose(a_1.numpy(), a_2.numpy()) def test_bool_tensor(self): with paddle.fluid.dygraph.guard(): @@ -565,7 +565,7 @@ def test_bool_tensor(self): y_np = tt[0:3, 1:5, 2:4] self.assertTrue(paddle.bool == y_paddle.dtype) - self.assertTrue(np.array_equal(y_paddle.numpy(), y_np)) + np.testing.assert_array_equal(y_paddle.numpy(), y_np) class TestImperativeVarBaseGetItem(unittest.TestCase): @@ -620,11 +620,11 @@ def test_axis_less_than_zero(self): 100, ], [0], [1]) np_slice = x_arr[:, :, 0:1] - self.assertTrue(np.array_equal(pp_slice, np_slice)) + np.testing.assert_allclose(pp_slice, np_slice) pp_slice = paddle.slice(x, (-100, ), [0], [1]) np_slice = x_arr[0:1] - self.assertTrue(np.array_equal(pp_slice, np_slice)) + np.testing.assert_allclose(pp_slice, np_slice) x_arr = np.array([], dtype=np.float32) x = paddle.to_tensor(np.reshape(x_arr, (0, 0, 0))) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py index f112cd6f66fa2..25dbbbd028e6e 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py @@ -157,8 +157,8 @@ def test_mlu(self): cpu_pred, cpu_loss = self._test(False) mlu_pred, mlu_loss = self._test(True) - self.assertTrue(np.allclose(mlu_pred, cpu_pred)) - self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-5) + np.testing.assert_allclose(mlu_loss, cpu_loss) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py index 2728473f55088..5f283a6c157bf 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_split_op_mlu.py @@ -108,8 +108,8 @@ def test_out(self): input1 = np.random.random([1, 10]).astype('float32') r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) ex_x0, ex_x1 = np.split(input1, (3, ), axis=1) - 
self.assertTrue(np.allclose(ex_x0, r0)) - self.assertTrue(np.allclose(ex_x1, r1)) + np.testing.assert_allclose(ex_x0, r0) + np.testing.assert_allclose(ex_x1, r1) class API_TestSplit2(unittest.TestCase): @@ -123,8 +123,8 @@ def test_out(self): input1 = np.random.random([1, 10]).astype('float32') r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) ex_x0, ex_x1 = np.split(input1, 2, axis=1) - self.assertTrue(np.allclose(ex_x0, r0)) - self.assertTrue(np.allclose(ex_x1, r1)) + np.testing.assert_allclose(ex_x0, r0) + np.testing.assert_allclose(ex_x1, r1) class API_TestDygraphSplit(unittest.TestCase): @@ -139,9 +139,9 @@ def test_out1(self): x1_out = x1.numpy() x2_out = x2.numpy() ex_x0, ex_x1, ex_x2 = np.split(input_1, 3, axis=1) - self.assertTrue(np.allclose(ex_x0, x0_out)) - self.assertTrue(np.allclose(ex_x1, x1_out)) - self.assertTrue(np.allclose(ex_x2, x2_out)) + np.testing.assert_allclose(ex_x0, x0_out) + np.testing.assert_allclose(ex_x1, x1_out) + np.testing.assert_allclose(ex_x2, x2_out) def test_out2(self): with fluid.dygraph.guard(paddle.MLUPlace(0)): @@ -153,9 +153,9 @@ def test_out2(self): x1_out = x1.numpy() x2_out = x2.numpy() ex_x0, ex_x1, ex_x2 = np.split(input_1, (1, 3), axis=1) - self.assertTrue(np.allclose(ex_x0, x0_out)) - self.assertTrue(np.allclose(ex_x1, x1_out)) - self.assertTrue(np.allclose(ex_x2, x2_out)) + np.testing.assert_allclose(ex_x0, x0_out) + np.testing.assert_allclose(ex_x1, x1_out) + np.testing.assert_allclose(ex_x2, x2_out) # attr(axis) is Tensor diff --git a/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py index 6a81c11c70b1b..7dc668dfe56f6 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py @@ -59,7 +59,7 @@ def check_place(self, place): x = paddle.to_tensor(x_np) y1 = _C_ops.squared_l2_norm(x) y2 = _C_ops.squared_l2_norm(x) - self.assertTrue(np.array_equal(y1.numpy(), y2.numpy())) + np.testing.assert_allclose(y1.numpy(), y2.numpy()) def test_main(self): self.check_place(paddle.CPUPlace()) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_stack_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_stack_op_mlu.py index b7dec57039452..573081f9fe0f7 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_stack_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_stack_op_mlu.py @@ -148,7 +148,7 @@ def test_out(self): }, fetch_list=[result_stack]) expected_result = np.stack([input1, input2, input3], axis=0) - self.assertTrue(np.allclose(expected_result, result)) + np.testing.assert_allclose(expected_result, result) def test_single_tensor_error(self): with fluid.program_guard(fluid.Program(), fluid.Program()): @@ -169,14 +169,14 @@ def test_out(self): result = paddle.stack([x1, x2, x3]) result_np = result.numpy() expected_result = np.stack([data1, data2, data3]) - self.assertTrue(np.allclose(expected_result, result_np)) + np.testing.assert_allclose(expected_result, result_np) with fluid.dygraph.guard(place=paddle.MLUPlace(0)): y1 = fluid.dygraph.to_variable(data1) result = paddle.stack([y1], axis=0) result_np_2 = result.numpy() expected_result_2 = np.stack([data1], axis=0) - self.assertTrue(np.allclose(expected_result_2, result_np_2)) + np.testing.assert_allclose(expected_result_2, result_np_2) def test_single_tensor_error(self): with fluid.dygraph.guard(place=paddle.MLUPlace(0)): diff --git 
a/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py index e1023a94bec5f..0b2d2ac86ff52 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_tanh_op_mlu.py @@ -145,8 +145,8 @@ def test_mlu(self): cpu_pred, cpu_loss = self._test(False) mlu_pred, mlu_loss = self._test(True) - self.assertTrue(np.allclose(mlu_pred, cpu_pred)) - self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(mlu_loss, cpu_loss) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_top_k_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_top_k_v2_op_mlu.py index 57081f1a54564..94cae5f355b1b 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_top_k_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_top_k_v2_op_mlu.py @@ -189,43 +189,43 @@ def run_dygraph(self, place): # test case for basic test case 1 paddle_result = paddle.topk(input_tensor, k=2) numpy_result = numpy_topk(self.input_data, k=2) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 2 with axis paddle_result = paddle.topk(input_tensor, k=2, axis=1) numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 3 with tensor K k_tensor = paddle.to_tensor(np.array([2])) paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1) numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 4 with tensor largest k_tensor = paddle.to_tensor(np.array([2])) paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False) numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 5 with axis -1 k_tensor = paddle.to_tensor(np.array([2])) paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False) numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 6 for the partial sort paddle_result = 
paddle.topk(large_input_tensor, k=1, axis=-1) numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 7 for the unsorted paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False) sort_paddle = numpy_topk(np.array(paddle_result[0].numpy()), axis=1, k=2) numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + np.testing.assert_allclose(sort_paddle[0], numpy_result[0]) def run_static(self, place): paddle.enable_static() @@ -263,32 +263,32 @@ def run_static(self, place): result7[0], result7[1] ]) numpy_result = numpy_topk(self.input_data, k=2) - self.assertTrue(np.allclose(paddle_result[0], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1], numpy_result[1])) + np.testing.assert_allclose(paddle_result[0], numpy_result[0]) + np.testing.assert_allclose(paddle_result[1], numpy_result[1]) numpy_result = numpy_topk(self.input_data, k=2, axis=-1) - self.assertTrue(np.allclose(paddle_result[2], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[3], numpy_result[1])) + np.testing.assert_allclose(paddle_result[2], numpy_result[0]) + np.testing.assert_allclose(paddle_result[3], numpy_result[1]) numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(paddle_result[4], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[5], numpy_result[1])) + np.testing.assert_allclose(paddle_result[4], numpy_result[0]) + np.testing.assert_allclose(paddle_result[5], numpy_result[1]) numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False) - self.assertTrue(np.allclose(paddle_result[6], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[7], numpy_result[1])) + np.testing.assert_allclose(paddle_result[6], numpy_result[0]) + np.testing.assert_allclose(paddle_result[7], numpy_result[1]) numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False) - self.assertTrue(np.allclose(paddle_result[8], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[9], numpy_result[1])) + np.testing.assert_allclose(paddle_result[8], numpy_result[0]) + np.testing.assert_allclose(paddle_result[9], numpy_result[1]) numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) - self.assertTrue(np.allclose(paddle_result[10], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[11], numpy_result[1])) + np.testing.assert_allclose(paddle_result[10], numpy_result[0]) + np.testing.assert_allclose(paddle_result[11], numpy_result[1]) sort_paddle = numpy_topk(paddle_result[12], axis=1, k=2) numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + np.testing.assert_allclose(sort_paddle[0], numpy_result[0]) def test_cases(self): places = [core.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py index bcb41283de91e..db2a08a0312c1 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py @@ -356,13 +356,13 @@ def test_moveaxis1(self): exe = paddle.static.Executor() out_np = 
exe.run(feed={"x": x_np}, fetch_list=[out])[0] - self.assertEqual(np.array_equal(out_np, expected), True) + np.testing.assert_array_equal(out_np, expected) paddle.disable_static() x = paddle.to_tensor(x_np) out = paddle.moveaxis(x, [0, 4, 3, 2], [1, 3, 2, 0]) self.assertEqual(out.shape, [4, 2, 5, 7, 3]) - self.assertEqual(np.array_equal(out.numpy(), expected), True) + np.testing.assert_array_equal(out.numpy(), expected) paddle.enable_static() def test_moveaxis2(self): @@ -376,13 +376,13 @@ def test_moveaxis2(self): exe = paddle.static.Executor() out_np = exe.run(feed={"x": x_np}, fetch_list=[out])[0] - self.assertEqual(np.array_equal(out_np, expected), True) + np.testing.assert_array_equal(out_np, expected) paddle.disable_static() x = paddle.to_tensor(x_np) out = x.moveaxis(-2, -1) self.assertEqual(out.shape, [2, 5, 3]) - self.assertEqual(np.array_equal(out.numpy(), expected), True) + np.testing.assert_array_equal(out.numpy(), expected) paddle.enable_static() def test_error(self): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_tril_triu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_tril_triu_op_mlu.py index ad6359ed714d6..da6557beb680d 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_tril_triu_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_tril_triu_op_mlu.py @@ -154,8 +154,8 @@ def test_api(self): feed={"x": data}, fetch_list=[tril_out, triu_out], ) - self.assertTrue(np.allclose(tril_out, np.tril(data))) - self.assertTrue(np.allclose(triu_out, np.triu(data))) + np.testing.assert_allclose(tril_out, np.tril(data)) + np.testing.assert_allclose(triu_out, np.triu(data)) def test_api_with_dygraph(self): paddle.disable_static() @@ -167,8 +167,8 @@ def test_api_with_dygraph(self): x = fluid.dygraph.to_variable(data) tril_out, triu_out = tensor.tril(x).numpy(), tensor.triu( x).numpy() - self.assertTrue(np.allclose(tril_out, np.tril(data))) - self.assertTrue(np.allclose(triu_out, np.triu(data))) + np.testing.assert_allclose(tril_out, np.tril(data)) + np.testing.assert_allclose(triu_out, np.triu(data)) def test_fluid_api(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_uniform_random_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_uniform_random_op_mlu.py index 70289853e8921..ca1b7b3e602b4 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_uniform_random_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_uniform_random_op_mlu.py @@ -71,8 +71,11 @@ def test_check_output(self): def verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) - self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01), - "hist: " + str(hist)) + np.testing.assert_allclose(hist, + prob, + rtol=0, + atol=0.01, + err_msg="hist: " + str(hist)) class TestMLUUniformRandomOpSelectedRows(unittest.TestCase): @@ -100,8 +103,11 @@ def check_with_place(self, place): op.run(scope, place) self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) - self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01), - "hist: " + str(hist)) + np.testing.assert_allclose(hist, + prob, + rtol=0, + atol=0.01, + err_msg="hist: " + str(hist)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py index 3f1d553f7386e..a22a4b1b8c7eb 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py 
@@ -288,7 +288,7 @@ def __test_where_with_broadcast_dygraph(self, cond_shape, a_shape, b_shape): result = paddle.where(cond, a, b) result = result.numpy() expect = np.where(cond, a, b) - self.assertTrue(np.array_equal(expect, result)) + np.testing.assert_array_equal(expect, result) def test_dygraph_api_broadcast_1(self): cond_shape = [2, 4] @@ -351,7 +351,7 @@ def test_where_condition(self): fetch_list=[z.name], return_numpy=False) expect_out = np.array([[0, 0], [1, 1]]) - self.assertTrue(np.allclose(expect_out, np.array(res))) + np.testing.assert_allclose(expect_out, np.array(res)) data = np.array([True, True, False]) with program_guard(Program(), Program()): x = fluid.layers.data(name='x', shape=[(-1)]) @@ -364,7 +364,7 @@ def test_where_condition(self): fetch_list=[z.name], return_numpy=False) expect_out = np.array([[0], [1]]) - self.assertTrue(np.allclose(expect_out, np.array(res))) + np.testing.assert_allclose(expect_out, np.array(res)) class TestWhereOpError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py index 92cd3025b07e4..70ab75ef5f242 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py @@ -304,8 +304,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-3) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-3) class TestNetWithEpsilonTensor(unittest.TestCase): @@ -447,9 +447,9 @@ def _test_with_place(self, place): preds.append(pred) losses.append(loss) for pred in preds: - self.assertTrue(np.allclose(pred, preds[0])) + np.testing.assert_allclose(pred, preds[0]) for loss in losses: - self.assertTrue(np.allclose(loss, losses[0])) + np.testing.assert_allclose(loss, losses[0]) def test_adam_api(self): # NOTE(zhiqiu): cpu and gpu has different seed, so should compare separatly. 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py index 8a0966339e871..579892dee3dce 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py @@ -249,8 +249,8 @@ def _test(self, run_npu=True): def test_npu(self): npu_pred, npu_loss = self._test(True) cpu_pred, cpu_loss = self._test(False) - self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=5e-3) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=5e-3) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py index d67b10845799b..ee43d18ae2f01 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py @@ -95,7 +95,7 @@ def test_not_contains_nan_inf(self): out, found_inf = self.run_prog(a, b, scale) print(out, found_inf) - self.assertTrue(np.allclose(out, (a / b) / scale[0])) + np.testing.assert_allclose(out, (a / b) / scale[0]) self.assertFalse(found_inf[0]) @@ -159,7 +159,7 @@ def test_not_contains_nan_inf(self): out, found_inf = self.run_prog(a, b, scale) print(out, found_inf) - self.assertTrue(np.allclose(out, (a + b) / scale[0])) + np.testing.assert_allclose(out, (a + b) / scale[0]) self.assertFalse(found_inf[0]) diff --git a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py index 808996d355fa0..581b0793af2cb 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest -import numpy +import numpy as np import sys sys.path.append("..") @@ -27,7 +27,7 @@ import paddle.fluid.layers as layers paddle.enable_static() -numpy.random.seed(2021) +np.random.seed(2021) class TestAssignValueNPUOp(op_test.OpTest): @@ -50,7 +50,7 @@ def set_npu(self): self.__class__.use_npu = True def init_data(self): - self.value = numpy.random.random(size=(2, 5)).astype(numpy.float32) + self.value = np.random.random(size=(2, 5)).astype(np.float32) self.attrs["fp32_values"] = [float(v) for v in self.value.flat] def test_forward(self): @@ -60,22 +60,22 @@ def test_forward(self): class TestAssignValueNPUOp2(TestAssignValueNPUOp): def init_data(self): - self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32) + self.value = np.random.random(size=(2, 5)).astype(np.int32) self.attrs["int32_values"] = [int(v) for v in self.value.flat] class TestAssignValueNPUOp3(TestAssignValueNPUOp): def init_data(self): - self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64) + self.value = np.random.random(size=(2, 5)).astype(np.int64) self.attrs["int64_values"] = [int(v) for v in self.value.flat] class TestAssignValueNPUOp4(TestAssignValueNPUOp): def init_data(self): - self.value = numpy.random.choice(a=[False, True], - size=(2, 5)).astype(numpy.bool) + self.value = np.random.choice(a=[False, True], + size=(2, 5)).astype(np.bool) self.attrs["bool_values"] = [int(v) for v in self.value.flat] @@ -83,7 +83,7 @@ class TestAssignApi(unittest.TestCase): def 
setUp(self): self.init_dtype() - self.value = (-100 + 200 * numpy.random.random(size=(2, 5))).astype( + self.value = (-100 + 200 * np.random.random(size=(2, 5))).astype( self.dtype) self.place = fluid.NPUPlace( 0) if fluid.core.is_compiled_with_npu() else fluid.CPUPlace() @@ -99,8 +99,10 @@ def test_assign(self): exe = fluid.Executor(self.place) [fetched_x] = exe.run(main_program, feed={}, fetch_list=[x]) - self.assertTrue(numpy.array_equal(fetched_x, self.value), - "fetch_x=%s val=%s" % (fetched_x, self.value)) + np.testing.assert_allclose(fetched_x, + self.value, + err_msg="fetch_x=%s val=%s" % + (fetched_x, self.value)) self.assertEqual(fetched_x.dtype, self.value.dtype) @@ -120,8 +122,8 @@ class TestAssignApi4(TestAssignApi): def setUp(self): self.init_dtype() - self.value = numpy.random.choice(a=[False, True], - size=(2, 5)).astype(numpy.bool) + self.value = np.random.choice(a=[False, True], + size=(2, 5)).astype(np.bool) self.place = fluid.NPUPlace( 0) if fluid.core.is_compiled_with_npu() else fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index c6b7fada1fb39..6b76a1104cb48 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -40,7 +40,10 @@ def setUp(self): self.data_formats = ["NCHW", "NHWC"] def __assert_close(self, tensor, np_array, msg, atol=1e-4): - self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + np.testing.assert_allclose(np.array(tensor), + np_array, + atol=atol, + err_msg=msg) def check_with_place(self, place, data_layout, dtype, shape): epsilon = epsilon = 0.00001 @@ -475,7 +478,7 @@ def compute(x, is_test, trainable_statistics): x = np.random.randn(*shape).astype("float32") y1 = compute(x, False, False) y2 = compute(x, True, True) - self.assertTrue(np.allclose(y1, y2)) + np.testing.assert_allclose(y1, y2, rtol=1e-5) def test_static(self): places = [fluid.NPUPlace(0)] @@ -498,7 +501,7 @@ def compute(x_np, is_test, trainable_statistics): x = np.random.randn(*shape).astype("float32") y1 = compute(x, False, False) y2 = compute(x, True, True) - self.assertTrue(np.allclose(y1, y2, atol=1e-5)) + np.testing.assert_allclose(y1, y2, atol=1e-5) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py b/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py index b7a5cd2405e60..ccb9369e059cf 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py @@ -60,7 +60,7 @@ def test_static_layer(place, "weight": weight_np }, fetch_list=[res]) - return static_result + return static_result[0] def test_static_functional(place, @@ -100,7 +100,7 @@ def test_static_functional(place, "weight": weight_np }, fetch_list=[res]) - return static_result + return static_result[0] def test_dygraph_layer(place, @@ -178,16 +178,18 @@ def test_BCELoss(self): dy_result = test_dygraph_layer(place, input_np, label_np, reduction) expected = calc_bceloss(input_np, label_np, reduction) - self.assertTrue(np.allclose(static_result, expected)) - self.assertTrue(np.allclose(static_result, dy_result)) - self.assertTrue(np.allclose(dy_result, expected)) + np.testing.assert_allclose(static_result, expected, rtol=1e-6) + np.testing.assert_allclose(static_result, dy_result) + np.testing.assert_allclose(dy_result, expected, rtol=1e-6) static_functional = 
test_static_functional( place, input_np, label_np, reduction) dy_functional = test_dygraph_functional(place, input_np, label_np, reduction) - self.assertTrue(np.allclose(static_functional, expected)) - self.assertTrue(np.allclose(static_functional, dy_functional)) - self.assertTrue(np.allclose(dy_functional, expected)) + np.testing.assert_allclose(static_functional, + expected, + rtol=1e-6) + np.testing.assert_allclose(static_functional, dy_functional) + np.testing.assert_allclose(dy_functional, expected, rtol=1e-6) def test_BCELoss_weight(self): input_np = np.random.uniform(0.1, 0.8, @@ -212,9 +214,9 @@ def test_BCELoss_weight(self): label_np, reduction, weight_np=weight_np) - self.assertTrue(np.allclose(static_result, expected)) - self.assertTrue(np.allclose(static_result, dy_result)) - self.assertTrue(np.allclose(dy_result, expected)) + np.testing.assert_allclose(static_result, expected, rtol=1e-6) + np.testing.assert_allclose(static_result, dy_result, rtol=1e-6) + np.testing.assert_allclose(dy_result, expected, rtol=1e-6) static_functional = test_static_functional(place, input_np, label_np, @@ -225,9 +227,11 @@ def test_BCELoss_weight(self): label_np, reduction, weight_np=weight_np) - self.assertTrue(np.allclose(static_functional, expected)) - self.assertTrue(np.allclose(static_functional, dy_functional)) - self.assertTrue(np.allclose(dy_functional, expected)) + np.testing.assert_allclose(static_functional, expected, rtol=1e-6) + np.testing.assert_allclose(static_functional, + dy_functional, + rtol=1e-6) + np.testing.assert_allclose(dy_functional, expected, rtol=1e-6) def test_BCELoss_error(self): paddle.disable_static(paddle.NPUPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py index 0a45cec0d0c95..ca13b29620406 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_beam_search_decode_op_npu.py @@ -92,9 +92,8 @@ def test_get_set(self): expected_data = np.array( [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64") - self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data)) - self.assertTrue(np.array_equal(np.array(sentence_scores), - expected_data)) + np.testing.assert_array_equal(np.array(sentence_ids), expected_data) + np.testing.assert_array_equal(np.array(sentence_scores), expected_data) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py index cf6af6462d061..a5162d85486ee 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py @@ -173,14 +173,14 @@ def test_clip(self): }, fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8]) - self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8))) - self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9))) - self.assertTrue(np.allclose(res3, data.clip(min=0.3))) - self.assertTrue(np.allclose(res4, data.clip(max=0.7))) - self.assertTrue(np.allclose(res5, data.clip(min=0.2))) - self.assertTrue(np.allclose(res6, data.clip(max=0.8))) - self.assertTrue(np.allclose(res7, data.clip(max=-1))) - self.assertTrue(np.allclose(res8, data)) + np.testing.assert_allclose(res1, data.clip(0.2, 0.8)) + np.testing.assert_allclose(res2, data.clip(0.2, 0.9)) + np.testing.assert_allclose(res3, data.clip(min=0.3)) + 
np.testing.assert_allclose(res4, data.clip(max=0.7)) + np.testing.assert_allclose(res5, data.clip(min=0.2)) + np.testing.assert_allclose(res6, data.clip(max=0.8)) + np.testing.assert_allclose(res7, data.clip(max=-1)) + np.testing.assert_allclose(res8, data) paddle.disable_static() def test_clip_dygraph(self): @@ -200,9 +200,9 @@ def test_clip_dygraph(self): images = paddle.to_tensor(data, dtype='float32') out_3 = self._executed_api(images, min=v_min, max=v_max) - self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8))) - self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9))) - self.assertTrue(np.allclose(out_3.numpy(), data.clip(0.2, 0.8))) + np.testing.assert_allclose(out_1.numpy(), data.clip(0.2, 0.8)) + np.testing.assert_allclose(out_2.numpy(), data.clip(0.2, 0.9)) + np.testing.assert_allclose(out_3.numpy(), data.clip(0.2, 0.8)) def test_errors(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py b/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py index 69f3b1bcbe41b..443faa9d794e4 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_collective_base_npu.py @@ -216,5 +216,5 @@ def check_with_place(self, model_file, col_type, need_envs={}): if col_type == "identity": need_result1 = input1 need_result2 = input2 - self.assertTrue(np.allclose(tr0_out, need_result1, rtol=0, atol=0)) - self.assertTrue(np.allclose(tr1_out, need_result2, rtol=0, atol=0)) + np.testing.assert_allclose(tr0_out, need_result1, rtol=0, atol=0) + np.testing.assert_allclose(tr1_out, need_result2, rtol=0, atol=0) diff --git a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py index 4fff3ab5fa059..c24b5bb165858 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py @@ -218,10 +218,8 @@ def _run_static_mode(self, use_fluid_api): self.assertTrue(self.out_var.shape[self.axis] == -1) exe = fluid.Executor(self.place) res = exe.run(self.program, fetch_list=self.out_var) - self.assertTrue( - np.array_equal( - res[0], np.concatenate([self.x] * self.iter_num, - axis=self.axis))) + np.testing.assert_allclose( + res[0], np.concatenate([self.x] * self.iter_num, axis=self.axis)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py index 44baf7a547c00..ad163d8064272 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py @@ -142,8 +142,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py index 9cf22adbb7591..be51fff6a294b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py @@ -33,15 +33,15 @@ def run_cases(self): y = paddle.cumsum(data) z = np.cumsum(data_np) - self.assertTrue(np.array_equal(z, 
y.numpy())) + np.testing.assert_array_equal(z, y.numpy()) y = paddle.cumsum(data, axis=0) z = np.cumsum(data_np, axis=0) - self.assertTrue(np.array_equal(z, y.numpy())) + np.testing.assert_array_equal(z, y.numpy()) y = paddle.cumsum(data, axis=-1) z = np.cumsum(data_np, axis=-1) - self.assertTrue(np.array_equal(z, y.numpy())) + np.testing.assert_array_equal(z, y.numpy()) y = paddle.cumsum(data, dtype='float32') self.assertTrue(y.dtype == core.VarDesc.VarType.FP32) @@ -51,7 +51,7 @@ def run_cases(self): y = paddle.cumsum(data, axis=-2) z = np.cumsum(data_np, axis=-2) - self.assertTrue(np.array_equal(z, y.numpy())) + np.testing.assert_array_equal(z, y.numpy()) def run_static(self, use_npu=False): with fluid.program_guard(fluid.Program()): @@ -74,15 +74,15 @@ def run_static(self, use_npu=False): ]) z = np.cumsum(data_np) - self.assertTrue(np.allclose(z, out[0])) + np.testing.assert_allclose(z, out[0]) z = np.cumsum(data_np, axis=0) - self.assertTrue(np.allclose(z, out[1])) + np.testing.assert_allclose(z, out[1]) z = np.cumsum(data_np, axis=-1) - self.assertTrue(np.allclose(z, out[2])) + np.testing.assert_allclose(z, out[2]) self.assertTrue(out[3].dtype == np.float32) self.assertTrue(out[4].dtype == np.int32) z = np.cumsum(data_np, axis=-2) - self.assertTrue(np.allclose(z, out[5])) + np.testing.assert_allclose(z, out[5]) def test_npu(self): # Now, npu tests need setting paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py index bca1d631c8e55..25b14eb46aee5 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py @@ -269,11 +269,11 @@ def check_static_result(self, place): fetches = exe.run(fluid.default_main_program(), feed={"input": in_np}, fetch_list=[res]) - self.assertTrue(np.allclose(fetches[0], res_np)) + np.testing.assert_allclose(fetches[0], res_np) fetches2 = exe.run(fluid.default_main_program(), feed={"input": in_np}, fetch_list=[res6]) - self.assertTrue(np.allclose(fetches2[0], res_np2)) + np.testing.assert_allclose(fetches2[0], res_np2) def test_static(self): for place in self.places: diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py index 9dcf4aa707ce5..bf066f3ea275b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py @@ -178,8 +178,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) class TestFloatStatus(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py index 6d3683615978f..22e3c19e94ea1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py @@ -326,8 +326,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + 
np.testing.assert_allclose(npu_pred, cpu_pred) + np.testing.assert_allclose(npu_loss, cpu_loss) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py index 2ddd7b4069d59..330191e51f93e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py @@ -222,8 +222,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py index f197f9bd381c7..6ae9ff19d3dc1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py @@ -328,8 +328,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py index 58ccc04a0f47a..bb9d6759c9ca2 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py @@ -231,8 +231,8 @@ def test_npu(self): npu_pred, npu_loss = self._test(True) cpu_pred, cpu_loos = self._test(False) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loos)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loos, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py index 5613afe18273e..6b7185ca01fbd 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py @@ -132,7 +132,7 @@ def test_npu(self): cpu_loss = self._test(False) npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) # ------------------------------------------------ diff --git a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py index 5f33d7358161a..2aefdd4349639 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py @@ -282,7 +282,7 @@ def test_imperative(self): output = paddle.fluid.layers.gather(input, index) output_np = output.numpy() expected_output = np.array([3, 4]) - self.assertTrue(np.allclose(output_np, expected_output)) + np.testing.assert_allclose(output_np[0], expected_output) paddle.enable_static() diff --git 
a/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py index 28b8ab9b25f93..635b43ad4dfc5 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py @@ -102,7 +102,7 @@ def test_out1(self): }, fetch_list=[out]) expected_output = np.array([[3, 4], [5, 6]]) - self.assertTrue(np.allclose(result, expected_output)) + np.testing.assert_allclose(result, expected_output, rtol=1e-5) def test_out2(self): with paddle.static.program_guard(paddle.static.Program(), @@ -120,7 +120,7 @@ def test_out2(self): }, fetch_list=[out]) expected_output = gather_numpy(x_np, index_np, axis=0) - self.assertTrue(np.allclose(result, expected_output)) + np.testing.assert_allclose(result, expected_output, rtol=1e-5) class TestGatherGrad(unittest.TestCase): @@ -174,8 +174,8 @@ def test_npu(self): npu_pred, npu_loss = self._test(True) cpu_pred, cpu_loss = self._test(False) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-5) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-5) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/npu/test_gaussian_random_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gaussian_random_op_npu.py index 470982b9e70eb..589b0f0c7078e 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_gaussian_random_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_gaussian_random_op_npu.py @@ -70,8 +70,12 @@ def verify_output(self, outs): hist2, _ = np.histogram(data, range=(-3, 5)) hist2 = hist2.astype("float32") hist2 /= float(outs[0].size) - self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.01), - "hist: " + str(hist) + " hist2: " + str(hist2)) + np.testing.assert_allclose(hist, + hist2, + rtol=0, + atol=0.01, + err_msg="hist: " + str(hist) + " hist2: " + + str(hist2)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py index a779e797808a0..a5ee86ba28b79 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py @@ -150,8 +150,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred, atol=1e-3)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-3)) + np.testing.assert_allclose(npu_pred, cpu_pred, atol=1e-3) + np.testing.assert_allclose(npu_loss, cpu_loss, atol=1e-3) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py index a83618392a1d1..a41ad009ad974 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py @@ -113,7 +113,7 @@ def test_static_api(self): res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) out_ref = ref_hardsigmoid(self.x_np) for r in res: - self.assertTrue(np.allclose(out_ref, r)) + np.testing.assert_allclose(out_ref, r, rtol=1e-6) def test_dygraph_api(self): paddle.disable_static(self.place) @@ -123,7 +123,7 @@ def test_dygraph_api(self): out2 = m(x) out_ref = ref_hardsigmoid(self.x_np) for r in [out1, out2]: - self.assertTrue(np.allclose(out_ref, 
r.numpy())) + np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-6) paddle.enable_static() def test_fluid_api(self): @@ -133,12 +133,12 @@ def test_fluid_api(self): exe = fluid.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) out_ref = ref_hardsigmoid(self.x_np, 0.2, 0.5) - self.assertTrue(np.allclose(out_ref, res[0])) + np.testing.assert_allclose(out_ref, res[0]) paddle.disable_static(self.place) x = paddle.to_tensor(self.x_np) out = paddle.fluid.layers.hard_sigmoid(x) - self.assertTrue(np.allclose(out_ref, out.numpy())) + np.testing.assert_allclose(out_ref, out.numpy()) paddle.enable_static() def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py index 4e83700da78a1..74b1fe9bbd3cb 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py @@ -115,16 +115,20 @@ def test_check_output_and_grad_npu(self): y = F.hardswish(data) y.sum().backward() - self.assertTrue( - np.allclose(self.out_y.numpy(), y.numpy()), - "Output of NPU HardSwish forward has diff at " + str(self.place) + - "\nExpect " + str(self.out_y) + "\n" + "But Got" + str(y) + - " in class " + self.__class__.__name__ + ".") - self.assertTrue( - np.allclose(self.out_g.numpy(), data.grad.numpy()), - "Output of NPU HardSwish backward has diff at " + str(self.place) + - "\nExpect " + str(self.out_g) + "\n" + "But Got" + str(data.grad) + - " in class " + self.__class__.__name__ + ".") + np.testing.assert_allclose( + self.out_y.numpy(), + y.numpy(), + err_msg="Output of NPU HardSwish forward has diff at " + + str(self.place) + "\nExpect " + str(self.out_y) + "\n" + "But Got" + + str(y) + " in class " + self.__class__.__name__ + ".", + rtol=1e-5) + np.testing.assert_allclose( + self.out_g.numpy(), + data.grad.numpy(), + err_msg="Output of NPU HardSwish backward has diff at " + + str(self.place) + "\nExpect " + str(self.out_g) + "\n" + "But Got" + + str(data.grad) + " in class " + self.__class__.__name__ + ".", + rtol=1e-5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py index 5428bf1e6571f..69c762a9ae0f5 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py @@ -133,7 +133,7 @@ def test_index_select_api(self): return_numpy=False) expect_out = np.array([[1.0, 2.0, 2.0], [5.0, 6.0, 6.0], [9.0, 10.0, 10.0]]).astype('float32') - self.assertTrue(np.allclose(expect_out, np.array(res))) + np.testing.assert_allclose(expect_out, np.array(res)) # case 2: with program_guard(Program(), Program()): @@ -149,7 +149,7 @@ def test_index_select_api(self): return_numpy=False) expect_out = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [5.0, 6.0, 7.0, 8.0]]).astype('float32') - self.assertTrue(np.allclose(expect_out, np.array(res))) + np.testing.assert_allclose(expect_out, np.array(res)) def test_dygraph_index_select_api(self): paddle.set_device("npu:0") @@ -163,7 +163,7 @@ def test_dygraph_index_select_api(self): np_z = z.numpy() expect_out = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [5.0, 6.0, 7.0, 8.0]]).astype('float32') - self.assertTrue(np.allclose(expect_out, np_z)) + np.testing.assert_allclose(expect_out, np_z) # case 2: x = paddle.to_tensor(self.data_x) @@ -172,7 +172,7 
@@ def test_dygraph_index_select_api(self): np_z = z.numpy() expect_out = np.array([[1.0, 2.0, 2.0], [5.0, 6.0, 6.0], [9.0, 10.0, 10.0]]).astype('float32') - self.assertTrue(np.allclose(expect_out, np_z)) + np.testing.assert_allclose(expect_out, np_z) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py index 3d9ba6c440779..200b99f67553a 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py @@ -117,7 +117,7 @@ def run_kl_loss(self, reduction, shape=(5, 20)): kldiv_criterion = paddle.nn.KLDivLoss(reduction) pred_loss = kldiv_criterion(paddle.to_tensor(x), paddle.to_tensor(target)) - self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss)) + np.testing.assert_allclose(pred_loss.numpy(), gt_loss, rtol=1e-6) def test_kl_loss_batchmean(self): self.run_kl_loss('batchmean') diff --git a/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py index 5295ed50555be..408554ae13fc8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py @@ -53,10 +53,11 @@ def init_dtype(self): self.atol = 1e-4 def __assert_close(self, tensor, np_array, msg, atol=1e-4): - self.assertTrue( - np.allclose(np.array(tensor).astype(np_array.dtype), - np_array, - atol=atol), msg) + np.testing.assert_allclose(np.array(tensor).astype( + np_array.dtype).reshape(np_array.shape), + np_array, + atol=atol, + err_msg=msg) def check_forward_backward(self, shape, diff --git a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py index d285d82f9d99a..922325087101f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py @@ -145,8 +145,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py index e6724a28354ca..7a50b1a198981 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py @@ -142,8 +142,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred, atol=1e-4)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-4)) + np.testing.assert_allclose(npu_pred, cpu_pred, atol=1e-4) + np.testing.assert_allclose(npu_loss, cpu_loss, atol=1e-4) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py index 8971f888b6574..ef98f73875907 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py @@ -128,13 +128,13 @@ def check_api(self, axis=-1): y = logsoftmax(x) exe = 
paddle.static.Executor(self.place) out = exe.run(feed={'x': self.x}, fetch_list=[y]) - self.assertTrue(np.allclose(out[0], ref_out)) + np.testing.assert_allclose(out[0], ref_out, rtol=1e-6) # test dygrapg api paddle.disable_static(self.place) x = paddle.to_tensor(self.x) y = logsoftmax(x) - self.assertTrue(np.allclose(y.numpy(), ref_out)) + np.testing.assert_allclose(y.numpy(), ref_out, rtol=1e-6) paddle.enable_static() def test_check_api(self): @@ -161,12 +161,12 @@ def check_api(self, axis=-1, dtype=None): y = F.log_softmax(x, axis, dtype) exe = paddle.static.Executor(self.place) out = exe.run(feed={'x': self.x}, fetch_list=[y]) - self.assertTrue(np.allclose(out[0], ref_out)) + np.testing.assert_allclose(out[0], ref_out, rtol=1e-6) paddle.disable_static(self.place) x = paddle.to_tensor(self.x) y = F.log_softmax(x, axis, dtype) - self.assertTrue(np.allclose(y.numpy(), ref_out), True) + np.testing.assert_allclose(y.numpy(), ref_out, rtol=1e-6) paddle.enable_static() def test_check_api(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py index d11d83f47cce2..d6ed4980041fe 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_memcpy_op_npu.py @@ -76,8 +76,8 @@ def test_npu_cpoy_to_cpu(self): npu_, cpu_ = exe.run(main_program, feed={}, fetch_list=[npu_var.name, cpu_var.name]) - self.assertTrue(np.allclose(npu_, cpu_)) - self.assertTrue(np.allclose(cpu_, np.ones((10, 10)))) + np.testing.assert_allclose(npu_, cpu_) + np.testing.assert_allclose(cpu_, np.ones((10, 10))) def test_cpu_cpoy_npu(self): main_program, npu_var, cpu_var = self.get_prog() @@ -90,8 +90,8 @@ def test_cpu_cpoy_npu(self): npu_, cpu_ = exe.run(main_program, feed={}, fetch_list=[npu_var.name, cpu_var.name]) - self.assertTrue(np.allclose(npu_, cpu_)) - self.assertTrue(np.allclose(npu_, np.zeros((10, 10)))) + np.testing.assert_allclose(npu_, cpu_) + np.testing.assert_allclose(npu_, np.zeros((10, 10))) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_merged_momentum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_merged_momentum_op_npu.py index dce642cc0634e..086911a56dbae 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_merged_momentum_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_merged_momentum_op_npu.py @@ -316,7 +316,7 @@ def run_op(use_merged): outs2 = run_op(False) self.assertEqual(len(outs1), len(outs2)) for i, (out1, out2) in enumerate(zip(outs1, outs2)): - self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + np.testing.assert_allclose(out1, out2, atol=1e-7) def test_main(self): self.check_with_place(self.place, multi_precision=False) @@ -370,13 +370,13 @@ def run_op(use_nesterov, use_merged): outs2 = run_op(use_nesterov=True, use_merged=False) self.assertEqual(len(outs1), len(outs2)) for i, (out1, out2) in enumerate(zip(outs1, outs2)): - self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + np.testing.assert_allclose(out1, out2, atol=1e-7) outs3 = run_op(use_nesterov=False, use_merged=True) outs4 = run_op(use_nesterov=False, use_merged=False) self.assertEqual(len(outs3), len(outs4)) for j, (out3, out4) in enumerate(zip(outs3, outs4)): - self.assertTrue(np.allclose(out3, out4, atol=1e-7)) + np.testing.assert_allclose(out3, out4, atol=1e-7) def test_main(self): self.check_with_place(self.place, multi_precision=False) diff --git a/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py 
b/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py index a4d388d2ed4f4..4350ee6bd1a3c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py @@ -123,8 +123,8 @@ def test_api(self): }, fetch_list=[grid_x, grid_y]) - self.assertTrue(np.allclose(res_1, out_1)) - self.assertTrue(np.allclose(res_2, out_2)) + np.testing.assert_allclose(res_1, out_1) + np.testing.assert_allclose(res_2, out_2) class TestMeshgridOp4(unittest.TestCase): @@ -154,8 +154,8 @@ def test_list_input(self): }, fetch_list=[grid_x, grid_y]) - self.assertTrue(np.allclose(res_1, out_1)) - self.assertTrue(np.allclose(res_2, out_2)) + np.testing.assert_allclose(res_1, out_1) + np.testing.assert_allclose(res_2, out_2) class TestMeshgridOp5(unittest.TestCase): @@ -185,8 +185,8 @@ def test_tuple_input(self): }, fetch_list=[grid_x, grid_y]) - self.assertTrue(np.allclose(res_1, out_1)) - self.assertTrue(np.allclose(res_2, out_2)) + np.testing.assert_allclose(res_1, out_1) + np.testing.assert_allclose(res_2, out_2) class TestMeshgridOp6(unittest.TestCase): @@ -209,8 +209,8 @@ def test_api_with_dygraph(self): tensor_4 = paddle.to_tensor(input_4) res_3, res_4 = paddle.tensor.meshgrid(tensor_3, tensor_4) - self.assertTrue(np.allclose(res_3.numpy(), out_3)) - self.assertTrue(np.allclose(res_4.numpy(), out_4)) + np.testing.assert_allclose(res_3.numpy(), out_3) + np.testing.assert_allclose(res_4.numpy(), out_4) paddle.enable_static() @@ -234,8 +234,8 @@ def test_api_with_dygraph_list_input(self): tensor_4 = paddle.to_tensor(input_4) res_3, res_4 = paddle.meshgrid([tensor_3, tensor_4]) - self.assertTrue(np.allclose(res_3.numpy(), out_3)) - self.assertTrue(np.allclose(res_4.numpy(), out_4)) + np.testing.assert_allclose(res_3.numpy(), out_3) + np.testing.assert_allclose(res_4.numpy(), out_4) paddle.enable_static() @@ -259,8 +259,8 @@ def test_api_with_dygraph_tuple_input(self): tensor_4 = paddle.to_tensor(input_4) res_3, res_4 = paddle.tensor.meshgrid((tensor_3, tensor_4)) - self.assertTrue(np.allclose(res_3.numpy(), out_3)) - self.assertTrue(np.allclose(res_4.numpy(), out_4)) + np.testing.assert_allclose(res_3.numpy(), out_3) + np.testing.assert_allclose(res_4.numpy(), out_4) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index c4adebcda6ff6..eda9e5800ebd1 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -285,8 +285,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) class TestMulNet3_2(unittest.TestCase): @@ -358,9 +358,9 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred, - atol=1e-5)) # atol needed on cann 20.3 - self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-5)) + np.testing.assert_allclose(npu_pred, cpu_pred, + atol=1e-5) # atol needed on cann 20.3 + np.testing.assert_allclose(npu_loss, cpu_loss, atol=1e-5) class TestMulNet3_2_xc2(unittest.TestCase): @@ -433,8 +433,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) 
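Where the original assertion passed a custom message as the second argument to assertTrue (as in the gaussian_random and hard_swish hunks above), the rewrite moves that text into the err_msg keyword of assert_allclose, so a failure shows both the custom message and NumPy's element-wise mismatch report. A short sketch with invented values:

import numpy as np

hist = np.array([0.24, 0.26, 0.25, 0.25])   # made-up histograms for illustration
hist2 = np.array([0.25, 0.25, 0.25, 0.25])
# rtol=0 makes this a purely absolute comparison with tolerance atol;
# err_msg is appended to NumPy's own mismatch report if the check fails.
np.testing.assert_allclose(hist, hist2, rtol=0, atol=0.01,
                           err_msg="hist: " + str(hist) + " hist2: " + str(hist2))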
- self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) class TestMulNet4_2(unittest.TestCase): @@ -509,9 +509,9 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred, - atol=1e-5)) # atol needed on cann 20.3 - self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-5)) + np.testing.assert_allclose(npu_pred, cpu_pred, + atol=1e-5) # atol needed on cann 20.3 + np.testing.assert_allclose(npu_loss, cpu_loss, atol=1e-5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py index 036e6a0a7f957..8feca4805bccd 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py @@ -77,9 +77,12 @@ def verify_output(self, outs): # normalize the input to get the probability prob = self.input_np / self.input_np.sum(axis=-1, keepdims=True) sample_prob = self.sample_output(np.array(outs[0])) - self.assertTrue( - np.allclose(sample_prob, prob, rtol=0, atol=0.01), - "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + np.testing.assert_allclose(sample_prob, + prob, + rtol=0, + atol=0.01, + err_msg="sample_prob: " + str(sample_prob) + + "\nprob: " + str(prob)) class TestMultinomialOp2(TestMultinomialOp): @@ -122,9 +125,12 @@ def test_dygraph(self): sample_prob = sample_output_one_dimension(out.numpy(), 4) prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True) - self.assertTrue( - np.allclose(sample_prob, prob, rtol=0, atol=0.01), - "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + np.testing.assert_allclose(sample_prob, + prob, + rtol=0, + atol=0.01, + err_msg="sample_prob: " + str(sample_prob) + + "\nprob: " + str(prob)) paddle.enable_static() def test_dygraph2(self): @@ -137,9 +143,12 @@ def test_dygraph2(self): sample_prob = sample_output_two_dimension(out.numpy(), [3, 4]) prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True) - self.assertTrue( - np.allclose(sample_prob, prob, rtol=0, atol=0.01), - "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + np.testing.assert_allclose(sample_prob, + prob, + rtol=0, + atol=0.01, + err_msg="sample_prob: " + str(sample_prob) + + "\nprob: " + str(prob)) paddle.enable_static() def test_dygraph3(self): @@ -182,9 +191,12 @@ def test_static(self): sample_prob = sample_output_one_dimension(out, 4) prob = x_np / x_np.sum(axis=-1, keepdims=True) - self.assertTrue( - np.allclose(sample_prob, prob, rtol=0, atol=0.01), - "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + np.testing.assert_allclose(sample_prob, + prob, + rtol=0, + atol=0.01, + err_msg="sample_prob: " + str(sample_prob) + + "\nprob: " + str(prob)) class TestMultinomialAlias(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py index c17b8461bd17f..17c21f7bc0daa 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py @@ -466,10 +466,10 @@ def test_case(self): out_h=12, out_w=12, align_corners=False) - self.assertTrue( - np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1)))) + 
np.testing.assert_allclose(results[0], + np.transpose(expect_res, (0, 2, 3, 1))) for i in range(len(results) - 1): - self.assertTrue(np.allclose(results[i + 1], expect_res)) + np.testing.assert_allclose(results[i + 1], expect_res) class TestNearestInterpException(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py index 5c5a05383889c..ee44a3a301205 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py @@ -398,7 +398,7 @@ def test_case(self): scale_factor=scale, mode="nearest", align_corners=False) - self.assertTrue(np.allclose(out.numpy(), expect_res)) + np.testing.assert_allclose(out.numpy(), expect_res) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py index 12ade62af4d98..f388e66881c87 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py @@ -202,8 +202,8 @@ def test_static(self): mode, value, data_format="NDHWC") - self.assertTrue(np.allclose(fetches[0], np_out1)) - self.assertTrue(np.allclose(fetches[1], np_out2)) + np.testing.assert_allclose(fetches[0], np_out1) + np.testing.assert_allclose(fetches[1], np_out2) def test_dygraph_1(self): paddle.disable_static() @@ -238,8 +238,8 @@ def test_dygraph_1(self): value=value, data_format="NDHWC") - self.assertTrue(np.allclose(y1.numpy(), np_out1)) - self.assertTrue(np.allclose(y2.numpy(), np_out2)) + np.testing.assert_allclose(y1.numpy(), np_out1) + np.testing.assert_allclose(y2.numpy(), np_out2) def test_dygraph_2(self): paddle.disable_static() @@ -274,8 +274,8 @@ def test_dygraph_2(self): value=value, data_format="NHWC") - self.assertTrue(np.allclose(y1.numpy(), np_out1)) - self.assertTrue(np.allclose(y2.numpy(), np_out2)) + np.testing.assert_allclose(y1.numpy(), np_out1) + np.testing.assert_allclose(y2.numpy(), np_out2) def test_dygraph_3(self): paddle.disable_static() @@ -310,8 +310,8 @@ def test_dygraph_3(self): value=value, data_format="NLC") - self.assertTrue(np.allclose(y1.numpy(), np_out1)) - self.assertTrue(np.allclose(y2.numpy(), np_out2)) + np.testing.assert_allclose(y1.numpy(), np_out1) + np.testing.assert_allclose(y2.numpy(), np_out2) class TestPad1dAPI(unittest.TestCase): @@ -360,14 +360,14 @@ def test_class(self): "constant", value=value, data_format="NCL") - self.assertTrue(np.allclose(output.numpy(), np_out)) + np.testing.assert_allclose(output.numpy(), np_out) output = pad_constant_int(data) np_out = self._get_numpy_out(input_data, [pad_int] * 2, "constant", value=value, data_format="NCL") - self.assertTrue(np.allclose(output.numpy(), np_out)) + np.testing.assert_allclose(output.numpy(), np_out) class TestPad2dAPI(unittest.TestCase): @@ -418,14 +418,14 @@ def test_class(self): "constant", value=value, data_format="NCHW") - self.assertTrue(np.allclose(output.numpy(), np_out)) + np.testing.assert_allclose(output.numpy(), np_out) output = pad_constant_int(data) np_out = self._get_numpy_out(input_data, [pad_int] * 4, "constant", value=value, data_format="NCHW") - self.assertTrue(np.allclose(output.numpy(), np_out)) + np.testing.assert_allclose(output.numpy(), np_out) class TestPad3dAPI(unittest.TestCase): @@ -478,14 +478,14 @@ def test_class(self): "constant", value=value, data_format="NCDHW") 
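A second behavioural difference is worth noting: np.allclose broadcasts its arguments, while np.testing.assert_allclose and assert_array_equal require the shapes to match exactly unless one side is a scalar. That is presumably why several hunks in this patch also index into the fetched result (e.g. static_result[0], output_np[0]) or reshape before comparing (the layer_norm helper above), rather than only swapping the assertion. A small sketch with assumed shapes:

import numpy as np

fetched = np.ones((1, 2))   # e.g. the first entry of an exe.run fetch list
expected = np.ones(2)

assert np.allclose(fetched, expected)             # True: broadcasting hides the extra dim
np.testing.assert_allclose(fetched[0], expected)  # shapes must match, so take row 0
# np.testing.assert_allclose(fetched, expected) would raise an AssertionError
# reporting a shape mismatch such as "(shapes (1, 2), (2,) mismatch)".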
- self.assertTrue(np.allclose(output.numpy(), np_out)) + np.testing.assert_allclose(output.numpy(), np_out) output = pad_constant_int(data) np_out = self._get_numpy_out(input_data, [pad_int] * 6, "constant", value=value, data_format="NCDHW") - self.assertTrue(np.allclose(output.numpy(), np_out)) + np.testing.assert_allclose(output.numpy(), np_out) class TestPad3dOpNpuError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py index 6274ba53781ae..a0938f17e7d91 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py @@ -142,8 +142,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py index 632defd7f0ede..5781c93964f2c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py @@ -150,8 +150,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred) + np.testing.assert_allclose(npu_loss, cpu_loss) class TestReduceSumNet2(TestReduceSumNet): diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py index 1bf503a37799a..3eb78bd923e60 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py @@ -163,8 +163,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py index f5f95deffba18..f8d2b1f7114a3 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py @@ -157,8 +157,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py index d71c1453c33f9..53bbbbd0978e6 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py @@ -88,8 +88,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - 
self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-3) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-3) class TestCenteredNet(unittest.TestCase): @@ -151,8 +151,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-3) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-3) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py index 1a3d0b1dbdff7..03c48b415dd4c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py @@ -112,8 +112,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index e0ad94361ad4c..f5a322f932c5b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -316,8 +316,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) class TestSliceOpDecsDim(OpTest): diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py index ada6e0f5f5384..c46bcb2bebdb1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py @@ -118,8 +118,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-2)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-2)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-2) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py index f6f3d746d8089..d24c02bddac9c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py @@ -155,8 +155,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-5) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-5) if __name__ == '__main__': diff --git 
a/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py index 3a06e0566d4dc..a922c851b2f9f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py @@ -113,8 +113,8 @@ def test_out(self): input1 = np.random.random([1, 10]).astype('float32') r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) ex_x0, ex_x1 = np.split(input1, (3, ), axis=1) - self.assertTrue(np.allclose(ex_x0, r0)) - self.assertTrue(np.allclose(ex_x1, r1)) + np.testing.assert_allclose(ex_x0, r0) + np.testing.assert_allclose(ex_x1, r1) class API_TestSplit2(unittest.TestCase): @@ -128,8 +128,8 @@ def test_out(self): input1 = np.random.random([1, 10]).astype('float32') r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) ex_x0, ex_x1 = np.split(input1, 2, axis=1) - self.assertTrue(np.allclose(ex_x0, r0)) - self.assertTrue(np.allclose(ex_x1, r1)) + np.testing.assert_allclose(ex_x0, r0) + np.testing.assert_allclose(ex_x1, r1) class API_TestDygraphSplit(unittest.TestCase): @@ -144,9 +144,9 @@ def test_out1(self): x1_out = x1.numpy() x2_out = x2.numpy() ex_x0, ex_x1, ex_x2 = np.split(input_1, 3, axis=1) - self.assertTrue(np.allclose(ex_x0, x0_out)) - self.assertTrue(np.allclose(ex_x1, x1_out)) - self.assertTrue(np.allclose(ex_x2, x2_out)) + np.testing.assert_allclose(ex_x0, x0_out) + np.testing.assert_allclose(ex_x1, x1_out) + np.testing.assert_allclose(ex_x2, x2_out) def test_out2(self): with fluid.dygraph.guard(paddle.NPUPlace(0)): @@ -158,9 +158,9 @@ def test_out2(self): x1_out = x1.numpy() x2_out = x2.numpy() ex_x0, ex_x1, ex_x2 = np.split(input_1, (1, 3), axis=1) - self.assertTrue(np.allclose(ex_x0, x0_out)) - self.assertTrue(np.allclose(ex_x1, x1_out)) - self.assertTrue(np.allclose(ex_x2, x2_out)) + np.testing.assert_allclose(ex_x0, x0_out) + np.testing.assert_allclose(ex_x1, x1_out) + np.testing.assert_allclose(ex_x2, x2_out) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py index 0ac775135e3b6..1d2bd13b35e48 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py @@ -145,8 +145,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py index 49dd0c94eb07d..3cc0296d8322e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py @@ -142,8 +142,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_squeeze_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_squeeze_op_npu.py index 827fb0344d84b..2661c1a1f81c8 
100644 --- a/python/paddle/fluid/tests/unittests/npu/test_squeeze_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_squeeze_op_npu.py @@ -143,7 +143,7 @@ def test_out(self): result, = exe.run(feed={"data1": input1}, fetch_list=[result_squeeze]) expected_result = np.squeeze(input1, axis=1) - self.assertTrue(np.allclose(expected_result, result)) + np.testing.assert_allclose(expected_result, result) class API_TestStaticSqueeze_(API_TestSqueeze): @@ -168,7 +168,7 @@ def test_out(self): output = self.squeeze(input, axis=[1]) out_np = output.numpy() expected_out = np.squeeze(input_1, axis=1) - self.assertTrue(np.allclose(expected_out, out_np)) + np.testing.assert_allclose(expected_out, out_np) def test_out_int8(self): paddle.disable_static() @@ -178,7 +178,7 @@ def test_out_int8(self): output = self.squeeze(input, axis=[1]) out_np = output.numpy() expected_out = np.squeeze(input_1, axis=1) - self.assertTrue(np.allclose(expected_out, out_np)) + np.testing.assert_allclose(expected_out, out_np) def test_out_uint8(self): paddle.disable_static() @@ -188,7 +188,7 @@ def test_out_uint8(self): output = self.squeeze(input, axis=[1]) out_np = output.numpy() expected_out = np.squeeze(input_1, axis=1) - self.assertTrue(np.allclose(expected_out, out_np)) + np.testing.assert_allclose(expected_out, out_np) def test_axis_not_list(self): paddle.disable_static() @@ -198,7 +198,7 @@ def test_axis_not_list(self): output = self.squeeze(input, axis=1) out_np = output.numpy() expected_out = np.squeeze(input_1, axis=1) - self.assertTrue(np.allclose(expected_out, out_np)) + np.testing.assert_allclose(expected_out, out_np) def test_dimension_not_1(self): paddle.disable_static() @@ -208,7 +208,7 @@ def test_dimension_not_1(self): output = self.squeeze(input, axis=(1, 0)) out_np = output.numpy() expected_out = np.squeeze(input_1, axis=1) - self.assertTrue(np.allclose(expected_out, out_np)) + np.testing.assert_allclose(expected_out, out_np) class API_TestDygraphSqueezeInplace(API_TestDygraphSqueeze): diff --git a/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py index ae20f642a2802..e33747403e343 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py @@ -157,9 +157,8 @@ def test_case(self): self.assertTrue(self.out_var.shape[self.axis] == -1) exe = fluid.Executor(self.place) res = exe.run(self.program, fetch_list=self.out_var) - self.assertTrue( - np.array_equal(res[0], - np.stack([self.x] * self.iter_num, axis=self.axis))) + np.testing.assert_allclose( + res[0], np.stack([self.x] * self.iter_num, axis=self.axis)) class TestTensorStackAPIWithLoDTensorArray(unittest.TestCase): @@ -192,9 +191,8 @@ def test_case(self): self.assertTrue(self.out_var.shape[self.axis] == -1) exe = fluid.Executor(self.place) res = exe.run(self.program, fetch_list=self.out_var) - self.assertTrue( - np.array_equal(res[0], - np.stack([self.x] * self.iter_num, axis=self.axis))) + np.testing.assert_allclose( + res[0], np.stack([self.x] * self.iter_num, axis=self.axis)) class API_test(unittest.TestCase): @@ -217,7 +215,7 @@ def test_out(self): }, fetch_list=[result_stack]) expected_result = np.stack([input1, input2, input3], axis=0) - self.assertTrue(np.allclose(expected_result, result)) + np.testing.assert_allclose(expected_result, result) def test_single_tensor_error(self): with fluid.program_guard(fluid.Program(), fluid.Program()): @@ -238,14 +236,14 @@ def test_out(self): result = 
paddle.stack([x1, x2, x3]) result_np = result.numpy() expected_result = np.stack([data1, data2, data3]) - self.assertTrue(np.allclose(expected_result, result_np)) + np.testing.assert_allclose(expected_result, result_np) with fluid.dygraph.guard(place=paddle.NPUPlace(0)): y1 = fluid.dygraph.to_variable(data1) result = paddle.stack([y1], axis=0) result_np_2 = result.numpy() expected_result_2 = np.stack([data1], axis=0) - self.assertTrue(np.allclose(expected_result_2, result_np_2)) + np.testing.assert_allclose(expected_result_2, result_np_2) def test_single_tensor_error(self): with fluid.dygraph.guard(place=paddle.NPUPlace(0)): diff --git a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py index e26f713f00f9d..72e6137e5dc21 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py @@ -145,8 +145,8 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + np.testing.assert_allclose(npu_pred, cpu_pred, rtol=1e-6) + np.testing.assert_allclose(npu_loss, cpu_loss, rtol=1e-6) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py index 86a58cfae097b..e5009ff4e80dc 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py @@ -241,47 +241,47 @@ def run_dygraph(self, place): # test case for basic test case 1 paddle_result = paddle.topk(input_tensor, k=2) numpy_result = numpy_topk(self.input_data, k=2) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 2 with axis paddle_result = paddle.topk(input_tensor, k=2, axis=1) numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 3 with tensor K k_tensor = paddle.to_tensor(np.array([2])) paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1) numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 4 with tensor largest k_tensor = paddle.to_tensor(np.array([2])) paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False) numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + 
np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 5 with axis -1 k_tensor = paddle.to_tensor(np.array([2])) paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False) numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 6 for the partial sort paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1) numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) - self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + np.testing.assert_allclose(paddle_result[0].numpy(), numpy_result[0]) + np.testing.assert_allclose(paddle_result[1].numpy(), numpy_result[1]) # test case for basic test case 7 for the unsorted paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False) sort_paddle = numpy_topk(np.array(paddle_result[0].numpy()), axis=1, k=2) numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + np.testing.assert_allclose(sort_paddle[0], numpy_result[0]) def run_static(self, place): paddle.enable_static() @@ -319,37 +319,37 @@ def run_static(self, place): result7[0], result7[1] ]) numpy_result = numpy_topk(self.input_data, k=2) - self.assertTrue(np.allclose(paddle_result[0], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[1], numpy_result[1])) + np.testing.assert_allclose(paddle_result[0], numpy_result[0]) + np.testing.assert_allclose(paddle_result[1], numpy_result[1]) numpy_result = numpy_topk(self.input_data, k=2, axis=-1) - self.assertTrue(np.allclose(paddle_result[2], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[3], numpy_result[1])) + np.testing.assert_allclose(paddle_result[2], numpy_result[0]) + np.testing.assert_allclose(paddle_result[3], numpy_result[1]) numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(paddle_result[4], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[5], numpy_result[1])) + np.testing.assert_allclose(paddle_result[4], numpy_result[0]) + np.testing.assert_allclose(paddle_result[5], numpy_result[1]) numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False) - self.assertTrue(np.allclose(paddle_result[6], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[7], numpy_result[1])) + np.testing.assert_allclose(paddle_result[6], numpy_result[0]) + np.testing.assert_allclose(paddle_result[7], numpy_result[1]) numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False) - self.assertTrue(np.allclose(paddle_result[8], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[9], numpy_result[1])) + np.testing.assert_allclose(paddle_result[8], numpy_result[0]) + np.testing.assert_allclose(paddle_result[9], numpy_result[1]) numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) - self.assertTrue(np.allclose(paddle_result[10], numpy_result[0])) - self.assertTrue(np.allclose(paddle_result[11], numpy_result[1])) + np.testing.assert_allclose(paddle_result[10], numpy_result[0]) + np.testing.assert_allclose(paddle_result[11], numpy_result[1]) sort_paddle = numpy_topk(paddle_result[12], axis=1, 
k=2) numpy_result = numpy_topk(self.input_data, k=2, axis=1) - self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + np.testing.assert_allclose(sort_paddle[0], numpy_result[0]) def test_cases(self): places = [core.NPUPlace(0)] diff --git a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py index bdc68d43a2241..f62c5b47d5ab0 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py @@ -157,8 +157,8 @@ def test_api(self): feed={"x": data}, fetch_list=[tril_out, triu_out], ) - self.assertTrue(np.allclose(tril_out, np.tril(data))) - self.assertTrue(np.allclose(triu_out, np.triu(data))) + np.testing.assert_allclose(tril_out, np.tril(data)) + np.testing.assert_allclose(triu_out, np.triu(data)) def test_api_with_dygraph(self): paddle.disable_static(fluid.NPUPlace(0)) @@ -170,8 +170,8 @@ def test_api_with_dygraph(self): x = fluid.dygraph.to_variable(data) tril_out, triu_out = tensor.tril(x).numpy(), tensor.triu( x).numpy() - self.assertTrue(np.allclose(tril_out, np.tril(data))) - self.assertTrue(np.allclose(triu_out, np.triu(data))) + np.testing.assert_allclose(tril_out, np.tril(data)) + np.testing.assert_allclose(triu_out, np.triu(data)) def test_fluid_api(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py index 0ce6deb42e097..2eef9b9083254 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py @@ -66,7 +66,7 @@ def test_npu(self): cpu_w = self._test(False) npu_w = self._test(True) - self.assertTrue(np.allclose(npu_w, cpu_w)) + np.testing.assert_allclose(npu_w, cpu_w) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py index 7f2c2753b9b98..30d4b82c7f42e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py @@ -71,8 +71,11 @@ def test_check_output(self): def verify_output(self, outs): hist, prob = self.output_hist(np.array(outs[0])) - self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01), - "hist: " + str(hist)) + np.testing.assert_allclose(hist, + prob, + rtol=0, + atol=0.01, + err_msg="hist: " + str(hist)) class TestNPUUniformRandomOpSelectedRows(unittest.TestCase): @@ -100,8 +103,11 @@ def check_with_place(self, place): op.run(scope, place) self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) - self.assertTrue(np.allclose(hist, prob, rtol=0, atol=0.01), - "hist: " + str(hist)) + np.testing.assert_allclose(hist, + prob, + rtol=0, + atol=0.01, + err_msg="hist: " + str(hist)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 160b4e2e6857d..dad503660eafe 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1048,12 +1048,13 @@ def _compare_expect_and_actual_outputs(self, str(expect_out) + "\n" + "But Got" + str(actual_out) + " in class " + self.__class__.__name__) else: - self.assertTrue( - 
np.array_equal(expect_out, actual_out), - "Output (" + name + ") has diff at " + str(place) + - " when using and not using inplace" + "\nExpect " + - str(expect_out) + "\n" + "But Got" + str(actual_out) + - " in class " + self.__class__.__name__ + '\n') + np.testing.assert_array_equal( + expect_out, + actual_out, + err_msg='Output (' + name + ') has diff at ' + str(place) + + ' when using and not using inplace' + '\nExpect ' + + str(expect_out) + '\n' + 'But Got' + str(actual_out) + + ' in class ' + self.__class__.__name__ + '\n') def _construct_grad_program_from_forward(self, fwd_program, grad_op_desc, op_grad_to_var): @@ -1457,7 +1458,6 @@ def compare_single_output_with_expect(self, name, expect): # see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng if expect_np.size == 0: self.op_test.assertTrue(actual_np.size == 0) # }}} - # print("actual_np, expect_np", actual_np, expect_np) self._compare_numpy(name, actual_np, expect_np) if isinstance(expect, tuple): self._compare_list(name, actual, expect) diff --git a/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py b/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py index cbc32bbc4a1e6..9c6a9412518ad 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_sparse_op.py @@ -51,7 +51,7 @@ def check_with_place(self, place): # get and compare result result_array = np.array(out_selected_rows.get_tensor()) - self.assertTrue(np.array_equal(result_array, np.square(np_array))) + np.testing.assert_array_equal(result_array, np.square(np_array)) def test_sparse_acti(self): places = [core.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index d90ae197783b1..116924544fc61 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -214,9 +214,9 @@ def test_clone(self): y = clone_x**3 y.backward() - self.assertTrue(np.array_equal(x, [1, 1]), True) - self.assertTrue(np.array_equal(clone_x.grad.numpy(), [3, 3]), True) - self.assertTrue(np.array_equal(x.grad.numpy(), [3, 3]), True) + np.testing.assert_array_equal(x, [1, 1]) + np.testing.assert_array_equal(clone_x.grad.numpy(), [3, 3]) + np.testing.assert_array_equal(x.grad.numpy(), [3, 3]) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) paddle.enable_static() @@ -229,7 +229,7 @@ def test_clone(self): feed={'X': x_np}, fetch_list=[clone_x])[0] - self.assertTrue(np.array_equal(y_np, x_np), True) + np.testing.assert_array_equal(y_np, x_np) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_assign_value_op.py b/python/paddle/fluid/tests/unittests/test_assign_value_op.py index 681e1893b02e0..6c6c26a8c6a2b 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_value_op.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest -import numpy +import numpy as np import op_test import paddle @@ -39,7 +39,7 @@ def setUp(self): self.outputs = {"Out": self.value} def init_data(self): - self.value = numpy.random.random(size=(2, 5)).astype(numpy.float32) + self.value = np.random.random(size=(2, 5)).astype(np.float32) self.attrs["fp32_values"] = [float(v) for v in self.value.flat] def test_forward(self): @@ -49,22 +49,22 @@ def test_forward(self): class 
TestAssignValueOp2(TestAssignValueOp): def init_data(self): - self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32) + self.value = np.random.random(size=(2, 5)).astype(np.int32) self.attrs["int32_values"] = [int(v) for v in self.value.flat] class TestAssignValueOp3(TestAssignValueOp): def init_data(self): - self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64) + self.value = np.random.random(size=(2, 5)).astype(np.int64) self.attrs["int64_values"] = [int(v) for v in self.value.flat] class TestAssignValueOp4(TestAssignValueOp): def init_data(self): - self.value = numpy.random.choice(a=[False, True], - size=(2, 5)).astype(numpy.bool) + self.value = np.random.choice(a=[False, True], + size=(2, 5)).astype(np.bool) self.attrs["bool_values"] = [int(v) for v in self.value.flat] @@ -72,7 +72,7 @@ class TestAssignApi(unittest.TestCase): def setUp(self): self.init_dtype() - self.value = (-100 + 200 * numpy.random.random(size=(2, 5))).astype( + self.value = (-100 + 200 * np.random.random(size=(2, 5))).astype( self.dtype) self.place = fluid.CUDAPlace( 0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace() @@ -88,8 +88,10 @@ def test_assign(self): exe = fluid.Executor(self.place) [fetched_x] = exe.run(main_program, feed={}, fetch_list=[x]) - self.assertTrue(numpy.array_equal(fetched_x, self.value), - "fetch_x=%s val=%s" % (fetched_x, self.value)) + np.testing.assert_array_equal(fetched_x, + self.value, + err_msg='fetch_x=%s val=%s' % + (fetched_x, self.value)) self.assertEqual(fetched_x.dtype, self.value.dtype) @@ -109,8 +111,8 @@ class TestAssignApi4(TestAssignApi): def setUp(self): self.init_dtype() - self.value = numpy.random.choice(a=[False, True], - size=(2, 5)).astype(numpy.bool) + self.value = np.random.choice(a=[False, True], + size=(2, 5)).astype(np.bool) self.place = fluid.CUDAPlace( 0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_attribute_var.py b/python/paddle/fluid/tests/unittests/test_attribute_var.py new file mode 100644 index 0000000000000..a79a8d400a363 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_attribute_var.py @@ -0,0 +1,158 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +import tempfile +import paddle +import paddle.inference as paddle_infer +from paddle.fluid.framework import program_guard, Program +import numpy as np + +paddle.enable_static() + + +class UnittestBase(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.init_info() + + def tearDown(self): + self.temp_dir.cleanup() + + def init_info(self): + self.shapes = None + self.save_path = None + + def infer_prog(self): + config = paddle_infer.Config(self.save_path + '.pdmodel', + self.save_path + '.pdiparams') + predictor = paddle_infer.create_predictor(config) + input_names = predictor.get_input_names() + for i, shape in enumerate(self.shapes): + input_handle = predictor.get_input_handle(input_names[i]) + fake_input = np.random.randn(*shape).astype("float32") + input_handle.reshape(shape) + input_handle.copy_from_cpu(fake_input) + predictor.run() + output_names = predictor.get_output_names() + output_handle = predictor.get_output_handle(output_names[0]) + output_data = output_handle.copy_to_cpu() + + return output_data + + +class TestDropout(UnittestBase): + + def init_info(self): + self.shapes = [[10, 10]] + self.save_path = os.path.join(self.temp_dir.name, 'dropout') + + def test_static(self): + main_prog = Program() + starup_prog = Program() + with program_guard(main_prog, starup_prog): + fc = paddle.nn.Linear(10, 10) + x = paddle.randn(self.shapes[0]) + x.stop_gradient = False + feat = fc(x) + # p is a Variable + p = paddle.randn([1]) + out = paddle.nn.functional.dropout(feat, p=p) + sgd = paddle.optimizer.SGD() + sgd.minimize(paddle.mean(out)) + # test _to_string + self.assertTrue("Var[" in str(main_prog)) + + exe = paddle.static.Executor() + exe.run(starup_prog) + res = exe.run(fetch_list=[x, out]) + # export model + paddle.static.save_inference_model(self.save_path, [x], [out], exe) + + # Test for Inference Predictor + infer_out = self.infer_prog() + self.assertEqual(infer_out.shape, (10, 10)) + + +class TestTileTensorList(UnittestBase): + + def init_info(self): + self.shapes = [[2, 3, 4]] + self.save_path = os.path.join(self.temp_dir.name, 'tile_tensors') + + def test_static(self): + main_prog = Program() + starup_prog = Program() + with program_guard(main_prog, starup_prog): + fc = paddle.nn.Linear(4, 10) + x = paddle.randn([2, 3, 4]) + x.stop_gradient = False + feat = fc(x) + shape0 = paddle.full([1], 1, dtype='int32') + shape1 = paddle.full([1], 2, dtype='int32') + shape = [3, shape1, shape0] + out = paddle.tile(feat, shape) + + sgd = paddle.optimizer.SGD() + sgd.minimize(paddle.mean(out)) + self.assertTrue("Vars[" in str(main_prog)) + + exe = paddle.static.Executor() + exe.run(starup_prog) + res = exe.run(fetch_list=[x, out]) + self.assertEqual(res[1].shape, (6, 6, 10)) + + paddle.static.save_inference_model(self.save_path, [x], [out], exe) + # Test for Inference Predictor + infer_out = self.infer_prog() + self.assertEqual(infer_out.shape, (6, 6, 10)) + + +class TestTileTensor(UnittestBase): + + def init_info(self): + self.shapes = [[2, 3, 4]] + self.save_path = os.path.join(self.temp_dir.name, 'tile_tensor') + + def test_static(self): + main_prog = Program() + starup_prog = Program() + with program_guard(main_prog, starup_prog): + fc = paddle.nn.Linear(4, 10) + x = paddle.randn([2, 3, 4]) + x.stop_gradient = False + feat = fc(x) + # shape is a Variable + shape = paddle.assign([3, 2, 1]) + out = paddle.tile(feat, shape) + + sgd = paddle.optimizer.SGD() + sgd.minimize(paddle.mean(out)) + self.assertTrue("Var[" in
str(main_prog)) + + exe = paddle.static.Executor() + exe.run(starup_prog) + res = exe.run(fetch_list=[x, out]) + self.assertEqual(res[1].shape, (6, 6, 10)) + + paddle.static.save_inference_model(self.save_path, [x], [out], exe) + # Test for Inference Predictor + infer_out = self.infer_prog() + self.assertEqual(infer_out.shape, (6, 6, 10)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py index 7544ff4571cce..d6d613225d73f 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -76,6 +76,14 @@ def forward(self, input): out = self.linear0(out) out = F.gelu(out, approximate=True) out = self.linear1(out) + param = paddle.fluid.layers.create_parameter([1024, 4096], + paddle.float32) + auto.shard_tensor(param, + dist_attr={ + "process_mesh": PP_MESH_1, + "dims_mapping": [-1, 1] + }) + out = paddle.fluid.layers.mul(out, param) return out diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py index dfb314796a9ff..5c699881c21dc 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -93,6 +93,14 @@ def forward(self, input): }) w_out = self.word_embeddings(input) out = self.linear0(w_out) + param = paddle.fluid.layers.create_parameter([4096, 4096], + paddle.float32) + auto.shard_tensor(param, + dist_attr={ + "process_mesh": PP_MESH_0, + "dims_mapping": [0, -1] + }) + out = paddle.fluid.layers.mul(out, param) gelu_out = F.gelu(out, approximate=True) out = self.linear1(gelu_out) out1 = self.linear2(gelu_out) @@ -228,7 +236,7 @@ def test_mlp_mppp(self): resharder = Resharder(dist_main_prog, dist_startup_prog, rank_id, dist_context, dist_params_grads) resharder.reshard() - + print_program_with_dist_attr(dist_main_prog, dist_context) # check send and recv result self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index dbfb1844fb0c0..bf21d29caa0bf 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -371,7 +371,7 @@ def test_buffer_state_dict(self): self.func_test_buffer_state_dict() def assert_var_base_equal(self, var1, var2): - self.assertTrue(np.array_equal(var1.numpy(), var2.numpy())) + np.testing.assert_array_equal(var1.numpy(), var2.numpy()) class BufferNetWithModification(paddle.nn.Layer): @@ -414,8 +414,8 @@ def func_test_modified(self): st_outs = self._run(True) for i in range(len(dy_outs)): - self.assertTrue( - np.array_equal(dy_outs[i].numpy(), st_outs[i].numpy())) + np.testing.assert_array_equal(dy_outs[i].numpy(), + st_outs[i].numpy()) def test_modified(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py index b3206e385f498..59a3ff34a02a9 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py @@ -91,9 +91,8 @@ def test_get_set(self): expected_data = np.array( [0, 2, 3, 1, 0, 2, 1, 0, 
4, 5, 3, 5, 0, 4, 5, 3, 1], "int64") - self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data)) - self.assertTrue(np.array_equal(np.array(sentence_scores), - expected_data)) + np.testing.assert_array_equal(np.array(sentence_ids), expected_data) + np.testing.assert_array_equal(np.array(sentence_scores), expected_data) @unittest.skipIf(not core.is_compiled_with_cuda(), diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py index 4982ed451cd8c..8ab6968eb2140 100644 --- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py +++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py @@ -90,7 +90,7 @@ def test_fixed_random_number(self): self.assertEqual(np.sum(index1), 8582429431) self.assertEqual(np.sum(index2), 8581445798) expect = [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.] - self.assertTrue(np.array_equal(y[16, 500, 500:510], expect)) + np.testing.assert_array_equal(y[16, 500, 500:510], expect) x = paddle.to_tensor(x_np, dtype='float32') y = paddle.bernoulli(x).numpy() @@ -99,7 +99,7 @@ def test_fixed_random_number(self): self.assertEqual(np.sum(index1), 8583509076) self.assertEqual(np.sum(index2), 8582778540) expect = [0., 0., 1., 1., 1., 1., 0., 1., 1., 1.] - self.assertTrue(np.array_equal(y[16, 500, 500:510], expect)) + np.testing.assert_array_equal(y[16, 500, 500:510], expect) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py index ffc173184728e..f88f0506ddd26 100644 --- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py +++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py @@ -118,9 +118,11 @@ def check_single_card_fetch_var(self): fetch_val2, = exe.run(compiled_prog, feed=feed_dict, fetch_list=[fetch_var]) - self.assertTrue( - np.array_equal(fetch_val1, fetch_val2), - "error var name: {}, fetch_val1: {}, fetch_val2: {}" + np.testing.assert_array_equal( + fetch_val1, + fetch_val2, + err_msg= + 'error var name: {}, fetch_val1: {}, fetch_val2: {}' .format( fetch_var, fetch_val1[~np.equal(fetch_val1, fetch_val2)], @@ -167,13 +169,14 @@ def check_multi_card_fetch_var(self): fetch_vals.append(fetch_val) for item in fetch_vals: - self.assertTrue(np.array_equal(fetch_vals[0], item)) - self.assertTrue( - np.array_equal(fetch_vals[0], item), - "error var name: {}, fetch_vals[0]: {}, item: {}". 
- format(fetch_var, - fetch_vals[0][~np.equal(fetch_vals[0], item)], - item[~np.equal(fetch_vals[0], item)])) + np.testing.assert_array_equal(fetch_vals[0], item) + np.testing.assert_array_equal( + fetch_vals[0], + item, + err_msg='error var name: {}, fetch_vals[0]: {}, item: {}' + .format(fetch_var, + fetch_vals[0][~np.equal(fetch_vals[0], item)], + item[~np.equal(fetch_vals[0], item)])) class CUDAInplaceTest(InplaceTestBase): diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py index 92eb35896255d..f45263bab3c90 100644 --- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py @@ -100,7 +100,7 @@ def test_prune(self): out = exe.run(main, feed={'x': np.ones([3]).astype('float32')}, fetch_list=[x1_grad]) - self.assertTrue(np.array_equal(out[0], [2., 0., 0.])) + np.testing.assert_array_equal(out[0], [2.0, 0.0, 0.0]) class TestDoubleGradient(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py index 6e9c9bcd147f1..cd67440990cb8 100644 --- a/python/paddle/fluid/tests/unittests/test_cast_op.py +++ b/python/paddle/fluid/tests/unittests/test_cast_op.py @@ -130,11 +130,10 @@ def test_eager(self): x = paddle.ones([2, 2], dtype="float16") x.stop_gradient = False out = paddle.cast(x, "float32") - self.assertTrue( - np.array_equal(out, - np.ones([2, 2]).astype("float32"))) + np.testing.assert_array_equal(out, + np.ones([2, 2]).astype('float32')) out.backward() - self.assertTrue(np.array_equal(x.gradient(), x.numpy())) + np.testing.assert_array_equal(x.gradient(), x.numpy()) self.assertTrue(x.gradient().dtype == np.float16) diff --git a/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py b/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py index 492dae47f2acb..736d31b01845d 100644 --- a/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py +++ b/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py @@ -56,10 +56,27 @@ def class_center_sample_numpy(label, classes_list, num_samples): return np.array(remapped_label), np.array(pos_class_center_per_device) +def python_api( + label, + num_classes=1, + num_samples=1, + ring_id=0, + rank=0, + nranks=0, + fix_seed=False, + seed=0, +): + return paddle.nn.functional.class_center_sample(label, + num_classes=num_classes, + num_samples=num_samples, + group=None) + + class TestClassCenterSampleOp(OpTest): def initParams(self): self.op_type = "class_center_sample" + self.python_api = python_api self.batch_size = 20 self.num_samples = 6 self.num_classes = 10 @@ -96,7 +113,8 @@ def setUp(self): } def test_check_output(self): - self.check_output(no_check_set=['SampledLocalClassCenter']) + self.check_output(no_check_set=['SampledLocalClassCenter'], + check_eager=True) class TestClassCenterSampleOpINT32(TestClassCenterSampleOp): diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index a893b65f5a421..731eedfca60a0 100755 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -62,9 +62,11 @@ def test_errors(self): globals()[cls_name] = Cls -for _type_name in {'float32', 'float64', 'int32', 'int64'}: +for _type_name in {'float32', 'float64', 'int32', 'int64', 'float16'}: if _type_name == 'float64' and core.is_compiled_with_rocm(): _type_name = 
'float32' + if _type_name == 'float16' and (not core.is_compiled_with_cuda()): + continue create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) @@ -148,6 +150,102 @@ def test_dynamic_api_float(self): self.assertEqual((out.numpy() == self.real_result).all(), True) paddle.enable_static() + def test_dynamic_api_inf_1(self): + if self.op_type == "equal": + paddle.disable_static() + x1 = np.array([1, float('inf'), float('inf')]).astype(np.int64) + x = paddle.to_tensor(x1) + y1 = np.array([1, float('-inf'), float('inf')]).astype(np.int64) + y = paddle.to_tensor(y1) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.real_result = (x1 == y1).astype(np.int64) + self.assertEqual( + (out.numpy().astype(np.int64) == self.real_result).all(), + True) + paddle.enable_static() + + def test_dynamic_api_inf_2(self): + if self.op_type == "equal": + paddle.disable_static() + x1 = np.array([1, float('inf'), + float('inf')]).astype(np.float32) + x = paddle.to_tensor(x1) + y1 = np.array([1, float('-inf'), + float('inf')]).astype(np.float32) + y = paddle.to_tensor(y1) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.real_result = (x1 == y1).astype(np.int64) + self.assertEqual( + (out.numpy().astype(np.int64) == self.real_result).all(), + True) + paddle.enable_static() + + def test_dynamic_api_inf_3(self): + if self.op_type == "equal": + paddle.disable_static() + x1 = np.array([1, float('inf'), + float('-inf')]).astype(np.float32) + x = paddle.to_tensor(x1) + y1 = np.array([1, 2, 3]).astype(np.float32) + y = paddle.to_tensor(y1) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.real_result = (x1 == y1).astype(np.int64) + self.assertEqual( + (out.numpy().astype(np.int64) == self.real_result).all(), + True) + paddle.enable_static() + + def test_dynamic_api_nan_1(self): + if self.op_type == "equal": + paddle.disable_static() + x1 = np.array([1, float('nan'), float('nan')]).astype(np.int64) + x = paddle.to_tensor(x1) + y1 = np.array([1, float('-nan'), float('nan')]).astype(np.int64) + y = paddle.to_tensor(y1) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.real_result = (x1 == y1).astype(np.int64) + self.assertEqual( + (out.numpy().astype(np.int64) == self.real_result).all(), + True) + paddle.enable_static() + + def test_dynamic_api_nan_2(self): + if self.op_type == "equal": + paddle.disable_static() + x1 = np.array([1, float('nan'), + float('nan')]).astype(np.float32) + x = paddle.to_tensor(x1) + y1 = np.array([1, float('-nan'), + float('nan')]).astype(np.float32) + y = paddle.to_tensor(y1) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.real_result = (x1 == y1).astype(np.int64) + self.assertEqual( + (out.numpy().astype(np.int64) == self.real_result).all(), + True) + paddle.enable_static() + + def test_dynamic_api_nan_3(self): + if self.op_type == "equal": + paddle.disable_static() + x1 = np.array([1, float('-nan'), + float('nan')]).astype(np.float32) + x = paddle.to_tensor(x1) + y1 = np.array([1, 2, 1]).astype(np.float32) + y = paddle.to_tensor(y1) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.real_result = (x1 == y1).astype(np.int64) + self.assertEqual( + (out.numpy().astype(np.int64) == self.real_result).all(), + True) + paddle.enable_static() + def test_not_equal(self): if self.op_type == "not_equal": paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_compiled_program.py 
b/python/paddle/fluid/tests/unittests/test_compiled_program.py index fab70b2c6ada4..1ca2034625b10 100644 --- a/python/paddle/fluid/tests/unittests/test_compiled_program.py +++ b/python/paddle/fluid/tests/unittests/test_compiled_program.py @@ -68,7 +68,7 @@ def test_compiled_program_base(self): "label": self.label }, fetch_list=[loss.name]) - self.assertTrue(np.array_equal(loss_data[0], self.loss)) + np.testing.assert_array_equal(loss_data[0], self.loss) def test_compiled_program_with_data_parallel(self): with new_program_scope(): @@ -90,7 +90,7 @@ def test_compiled_program_with_data_parallel(self): "label": self.label }, fetch_list=[loss.name]) - self.assertTrue(np.array_equal(loss_data[0], self.loss)) + np.testing.assert_array_equal(loss_data[0], self.loss) class TestCompiledProgramError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 130a7e8833b4a..0bf3d6230d84f 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -447,10 +447,8 @@ def _run_static_mode(self, use_fluid_api): self.assertTrue(self.out_var.shape[self.axis] == -1) exe = fluid.Executor(self.place) res = exe.run(self.program, fetch_list=self.out_var) - self.assertTrue( - np.array_equal( - res[0], np.concatenate([self.x] * self.iter_num, - axis=self.axis))) + np.testing.assert_array_equal( + res[0], np.concatenate([self.x] * self.iter_num, axis=self.axis)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_conj_op.py b/python/paddle/fluid/tests/unittests/test_conj_op.py index a3b3f24326034..d1b8de82bee6b 100644 --- a/python/paddle/fluid/tests/unittests/test_conj_op.py +++ b/python/paddle/fluid/tests/unittests/test_conj_op.py @@ -84,7 +84,7 @@ def test_conj_api(self): var_x = paddle.to_tensor(input) result = paddle.conj(var_x).numpy() target = np.conj(input) - self.assertTrue(np.array_equal(result, target)) + np.testing.assert_array_equal(result, target) def test_conj_operator(self): for dtype in self._dtypes: @@ -96,7 +96,7 @@ def test_conj_operator(self): var_x = paddle.to_tensor(input) result = var_x.conj().numpy() target = np.conj(input) - self.assertTrue(np.array_equal(result, target)) + np.testing.assert_array_equal(result, target) def test_conj_static_mode(self): @@ -118,7 +118,7 @@ def init_input_output(dtype): exe = static.Executor(place) out_value = exe.run(feed=input_dict, fetch_list=[out.name]) - self.assertTrue(np.array_equal(np_res, out_value[0])) + np.testing.assert_array_equal(np_res, out_value[0]) def test_conj_api_real_number(self): for dtype in self._dtypes: @@ -128,7 +128,7 @@ def test_conj_api_real_number(self): var_x = paddle.to_tensor(input) result = paddle.conj(var_x).numpy() target = np.conj(input) - self.assertTrue(np.array_equal(result, target)) + np.testing.assert_array_equal(result, target) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py index 446a5500bc30b..0b795d4c0eba5 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -174,10 +174,10 @@ def test_concat_and_split(self): y_np = y.numpy() y_np_expected = np.concatenate(xs_np) - self.assertTrue(np.array_equal(y_np, y_np_expected)) + np.testing.assert_array_equal(y_np, y_np_expected) self.assertEqual(len(zs), len(xs_np)) for i, z in enumerate(zs): - 
self.assertTrue(np.array_equal(z.numpy(), xs_np[i])) + np.testing.assert_array_equal(z.numpy(), xs_np[i]) output_dir = 'cuda_graph_dot_{}'.format(os.getpid()) try: @@ -233,8 +233,8 @@ def __getitem__(self, idx): graph.replay() actual_x = np.array([[i]]).astype(dtype) actual_y = np.array([[i * i]]).astype(dtype) - self.assertTrue(np.array_equal(actual_x, x.numpy())) - self.assertTrue(np.array_equal(actual_y, y.numpy())) + np.testing.assert_array_equal(actual_x, x.numpy()) + np.testing.assert_array_equal(actual_y, y.numpy()) def test_dev_ctx_alloc(self): if not can_use_cuda_graph(): diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph.py index b0e6878e3fef2..ecd3f406e088d 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph.py @@ -68,9 +68,9 @@ def check(self, func): layer, value2 = self.run_base(func, True, "default") _, value3 = self.run_base(func, True, "new") _, value4 = self.run_base(func, True, layer) - self.assertTrue(np.array_equal(value1, value2)) - self.assertTrue(np.array_equal(value1, value3)) - self.assertTrue(np.array_equal(value1, value4)) + np.testing.assert_array_equal(value1, value2) + np.testing.assert_array_equal(value1, value3) + np.testing.assert_array_equal(value1, value4) def test_layer(self): self.check(SimpleModel(10, 20)) diff --git a/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py b/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py index 5405ca1980689..87c4f6cee5bbd 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_stream_event.py @@ -128,7 +128,7 @@ def test_stream_guard_normal(self): # kernels to be completed on windows. 
s.synchronize() - self.assertTrue(np.array_equal(np.array(c), np.array(d))) + np.testing.assert_array_equal(np.array(c), np.array(d)) def test_stream_guard_default_stream(self): if paddle.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py index 7e11ad647d963..1989a8c1448a8 100644 --- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py @@ -31,15 +31,15 @@ def run_cases(self): y = paddle.cumsum(data) z = np.cumsum(data_np) - self.assertTrue(np.array_equal(z, y.numpy())) + np.testing.assert_array_equal(z, y.numpy()) y = paddle.cumsum(data, axis=0) z = np.cumsum(data_np, axis=0) - self.assertTrue(np.array_equal(z, y.numpy())) + np.testing.assert_array_equal(z, y.numpy()) y = paddle.cumsum(data, axis=-1) z = np.cumsum(data_np, axis=-1) - self.assertTrue(np.array_equal(z, y.numpy())) + np.testing.assert_array_equal(z, y.numpy()) y = paddle.cumsum(data, dtype='float64') self.assertTrue(y.dtype == core.VarDesc.VarType.FP64) @@ -49,7 +49,7 @@ def run_cases(self): y = paddle.cumsum(data, axis=-2) z = np.cumsum(data_np, axis=-2) - self.assertTrue(np.array_equal(z, y.numpy())) + np.testing.assert_array_equal(z, y.numpy()) def run_static(self, use_gpu=False): with fluid.program_guard(fluid.Program()): diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py index e2062238b1161..2f7a6dd5a4236 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py @@ -93,8 +93,8 @@ def fake_reader(): L1 = np.array(L1) L2 = np.array(L2) - self.assertTrue(np.array_equal(I1, I2)) - self.assertTrue(np.array_equal(L1, L2)) + np.testing.assert_array_equal(I1, I2) + np.testing.assert_array_equal(L1, L2) batch_id += 1 if break_beforehand and batch_id >= int( diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index 59d6ce70ddc9b..426af39ca9062 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -40,8 +40,15 @@ def check_with_place(self, "http_proxy": "", "CPU_NUM": "2", "LOG_DIRNAME": "/tmp", + "SAVE_DIRNAME": "/tmp/TestDistMnistAsyncInMemoryDataset2x2/model", "SAVE_CACHE_DIRNAME": "/tmp/TestDistMnistAsyncInMemoryDataset2x2/cache_model", + "SAVE_DENSE_PARAM_DIRNAME": + "/tmp/TestDistMnistAsyncInMemoryDataset2x2/dense_param", + "SAVE_ONE_TABLE_DIRNAME": + "/tmp/TestDistMnistAsyncInMemoryDataset2x2/table_0", + "SAVE_PATCH_DIRNAME": + "/tmp/TestDistMnistAsyncInMemoryDataset2x2/patch_model", "LOG_PREFIX": self.__class__.__name__, } diff --git a/python/paddle/fluid/tests/unittests/test_dot_op.py b/python/paddle/fluid/tests/unittests/test_dot_op.py index ffdc90dd986ad..1ce352251c149 100644 --- a/python/paddle/fluid/tests/unittests/test_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_dot_op.py @@ -138,9 +138,8 @@ def test_dygraph(self): np.array([[1, 3], [3, 5]]).astype(np.float32)) y1 = fluid.dygraph.to_variable( np.array([[2, 5], [6, 8]]).astype(np.float32)) - self.assertTrue( - np.array_equal( - paddle.dot(x1, y1).numpy(), np.array([[17], [58]]))) + np.testing.assert_array_equal( + paddle.dot(x1, y1).numpy(), np.array([[17], [58]])) class TestComplexDotOp(OpTest): diff --git 
a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 33992b1881ec4..a5f33288362fd 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -1013,10 +1013,9 @@ def test_backward_downscale_in_infer(self): out, mask = core.ops.dropout(input, 'dropout_prob', 0.5) out.backward() - self.assertTrue( - np.array_equal( - input.gradient(), - self.cal_grad_downscale_in_infer(mask.numpy()))) + np.testing.assert_array_equal( + input.gradient(), + self.cal_grad_downscale_in_infer(mask.numpy())) def test_backward_downscale_in_infer_eager(self): for place in self.places: @@ -1027,10 +1026,9 @@ def test_backward_downscale_in_infer_eager(self): out, mask = _C_ops.final_state_dropout( input, None, 0.5, False, "downgrade_in_infer", 0, False) out.backward() - self.assertTrue( - np.array_equal( - input.gradient(), - self.cal_grad_downscale_in_infer(mask.numpy()))) + np.testing.assert_array_equal( + input.gradient(), + self.cal_grad_downscale_in_infer(mask.numpy())) def test_backward_upscale_train(self): _enable_legacy_dygraph() @@ -1103,6 +1101,47 @@ def test_backward_upscale_train_2_eager(self): self.cal_grad_upscale_train(mask.numpy(), prob))) +class TestDropOutWithProbTensor(unittest.TestCase): + + def setUp(self): + shapes = [[10, 10], [10, 10, 10], [10, 10, 10, 10]] + self.inputs = [ + np.random.random(shape).astype("float32") for shape in shapes + ] + self.place = paddle.CUDAPlace( + 0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace() + + def api_case(self, x): + p = paddle.assign([0.5]) + out = paddle.nn.functional.dropout(x=x, p=p, training=True) + return out + + def run_static(self, x): + paddle.seed(2022) + main_program = Program() + + with program_guard(main_program): + input = paddle.static.data(shape=x.shape, name='x', dtype='float32') + out = self.api_case(input) + + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'x': x}, fetch_list=[out]) + + return res[0] + + def run_dygraph(self, x): + paddle.seed(2022) + with fluid.dygraph.guard(self.place): + out = self.api_case(paddle.to_tensor(x)) + return out + + def test_p_tensor(self): + for x in self.inputs: + static_res = self.run_static(x) + dygraph_res = self.run_dygraph(x) + self.assertTrue(np.array_equal(static_res, dygraph_res)) + + class TestRandomValue(unittest.TestCase): def test_fixed_random_number(self): diff --git a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py index 167748c5a98be..22fb98a7c61f4 100644 --- a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py @@ -82,7 +82,7 @@ def run_main(self, place): value2 = build_and_run_program(place, self.batch_size, self.beam_size, True) - self.assertTrue(np.array_equal(value1, value2)) + np.testing.assert_array_equal(value1, value2) def test_check_main(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_eager_run_program.py b/python/paddle/fluid/tests/unittests/test_eager_run_program.py index 8d3ebcfbac5ac..a04c544e90257 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_run_program.py +++ b/python/paddle/fluid/tests/unittests/test_eager_run_program.py @@ -108,11 +108,11 @@ def test_eager(self): loss = paddle.mean(out_t) loss.backward() - self.assertTrue(np.array_equal(np.ones([2, 2]) * 4, out_t.numpy())) 
- self.assertTrue( - np.array_equal(np.ones([2, 4]) * 0.5, x_t.grad.numpy())) - self.assertTrue( - np.array_equal(np.ones([4, 2]) * 0.5, y_t.grad.numpy())) + np.testing.assert_array_equal(np.ones([2, 2]) * 4, out_t.numpy()) + np.testing.assert_array_equal( + np.ones([2, 4]) * 0.5, x_t.grad.numpy()) + np.testing.assert_array_equal( + np.ones([4, 2]) * 0.5, y_t.grad.numpy()) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py b/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py index 4afbe2d715592..26702d682d166 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py @@ -32,7 +32,7 @@ def test_elementwise_add(self): out_arr = out.numpy() out_arr_expected = np.add(np_x, np_y) - self.assertTrue(np.array_equal(out_arr, out_arr_expected)) + np.testing.assert_array_equal(out_arr, out_arr_expected) def test_sum(self): with _test_eager_guard(): @@ -42,7 +42,7 @@ def test_sum(self): out = paddle.sum(x, axis=0) out_arr = out.numpy() out_arr_expected = np.sum(x_data, axis=0) - self.assertTrue(np.array_equal(out_arr, out_arr_expected)) + np.testing.assert_array_equal(out_arr, out_arr_expected) def test_mm(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 7fe755225f41a..5a678075a111e 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -54,7 +54,7 @@ def test_retain_grad_and_run_backward(self): self.assertIsNone(data_eager.grad) out_eager.backward(grad_eager, False) self.assertIsNotNone(data_eager.grad) - self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) + np.testing.assert_array_equal(data_eager.grad.numpy(), input_data) def test_retain_grad_and_run_backward_raises(self): with _test_eager_guard(): @@ -92,7 +92,7 @@ def check_to_tesnsor_and_numpy(self, dtype, proto_dtype): arr = np.random.random([4, 16, 16, 32]).astype(dtype) tensor = paddle.to_tensor(arr, dtype) self.assertEqual(tensor.dtype, proto_dtype) - self.assertTrue(np.array_equal(arr, tensor.numpy())) + np.testing.assert_array_equal(arr, tensor.numpy()) def test_dtype_base(self): print("Test_dtype") @@ -138,7 +138,7 @@ def constructor(self, place): self.assertEqual(egr_tensor1.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor1.stop_gradient, False) self.assertTrue(egr_tensor1.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor1.numpy(), arr0)) + np.testing.assert_array_equal(egr_tensor1.numpy(), arr0) arr1 = np.random.randint(100, size=(4, 16, 16, 32), dtype=np.int64) egr_tensor2 = core.eager.Tensor(arr1, place, False, True, @@ -149,7 +149,7 @@ def constructor(self, place): self.assertEqual(egr_tensor2.dtype, core.VarDesc.VarType.INT64) self.assertEqual(egr_tensor2.stop_gradient, True) self.assertTrue(egr_tensor2.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor2.numpy(), arr1)) + np.testing.assert_array_equal(egr_tensor2.numpy(), arr1) arr2 = np.random.rand(4, 16, 16, 32, 64).astype('float32') egr_tensor3 = core.eager.Tensor(arr2) @@ -161,7 +161,7 @@ def constructor(self, place): self.assertTrue( egr_tensor3.place._equals( paddle.fluid.framework._current_expected_place())) - self.assertTrue(np.array_equal(egr_tensor3.numpy(), arr2)) + np.testing.assert_array_equal(egr_tensor3.numpy(), arr2) egr_tensor3.stop_gradient = 
False egr_tensor4 = core.eager.Tensor(egr_tensor3) @@ -173,8 +173,7 @@ def constructor(self, place): self.assertTrue( egr_tensor4.place._equals( paddle.fluid.framework._current_expected_place())) - self.assertTrue(np.array_equal(egr_tensor4.numpy(), - egr_tensor3.numpy())) + np.testing.assert_array_equal(egr_tensor4.numpy(), egr_tensor3.numpy()) arr4 = np.random.rand(4, 16, 16, 32).astype('float32') egr_tensor5 = core.eager.Tensor(arr4, place) @@ -184,7 +183,7 @@ def constructor(self, place): self.assertEqual(egr_tensor5.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor5.stop_gradient, True) self.assertTrue(egr_tensor5.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor5.numpy(), arr4)) + np.testing.assert_array_equal(egr_tensor5.numpy(), arr4) egr_tensor6 = core.eager.Tensor(egr_tensor5, core.CPUPlace()) self.assertEqual(egr_tensor6.persistable, False) @@ -193,8 +192,7 @@ def constructor(self, place): self.assertEqual(egr_tensor6.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor6.stop_gradient, True) self.assertEqual(egr_tensor6.place.is_cpu_place(), True) - self.assertTrue(np.array_equal(egr_tensor6.numpy(), - egr_tensor5.numpy())) + np.testing.assert_array_equal(egr_tensor6.numpy(), egr_tensor5.numpy()) egr_tensor7 = core.eager.Tensor(arr4, place, True) self.assertEqual(egr_tensor7.persistable, True) @@ -203,7 +201,7 @@ def constructor(self, place): self.assertEqual(egr_tensor7.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor7.stop_gradient, True) self.assertTrue(egr_tensor7.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor7.numpy(), arr4)) + np.testing.assert_array_equal(egr_tensor7.numpy(), arr4) egr_tensor8 = core.eager.Tensor(egr_tensor6, place, "egr_tensor8") self.assertEqual(egr_tensor8.persistable, False) @@ -212,8 +210,7 @@ def constructor(self, place): self.assertEqual(egr_tensor8.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor8.stop_gradient, True) self.assertTrue(egr_tensor8.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor8.numpy(), - egr_tensor5.numpy())) + np.testing.assert_array_equal(egr_tensor8.numpy(), egr_tensor5.numpy()) egr_tensor9 = core.eager.Tensor(arr4, place, True, True) self.assertEqual(egr_tensor9.persistable, True) @@ -222,7 +219,7 @@ def constructor(self, place): self.assertEqual(egr_tensor9.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor9.stop_gradient, True) self.assertTrue(egr_tensor9.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor9.numpy(), arr4)) + np.testing.assert_array_equal(egr_tensor9.numpy(), arr4) x = np.random.rand(3, 3).astype('float32') t = paddle.fluid.Tensor() @@ -234,7 +231,7 @@ def constructor(self, place): self.assertEqual(egr_tensor10.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor10.stop_gradient, True) self.assertTrue(egr_tensor10.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor10.numpy(), x)) + np.testing.assert_array_equal(egr_tensor10.numpy(), x) egr_tensor11 = core.eager.Tensor(t, place, "framework_constructed") self.assertEqual(egr_tensor11.persistable, False) @@ -243,7 +240,7 @@ def constructor(self, place): self.assertEqual(egr_tensor11.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor11.stop_gradient, True) self.assertTrue(egr_tensor11.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor11.numpy(), x)) + np.testing.assert_array_equal(egr_tensor11.numpy(), x) egr_tensor12 = core.eager.Tensor(t) self.assertEqual(egr_tensor12.persistable, False) @@ 
-252,7 +249,7 @@ def constructor(self, place): self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor12.stop_gradient, True) self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) - self.assertTrue(np.array_equal(egr_tensor12.numpy(), x)) + np.testing.assert_array_equal(egr_tensor12.numpy(), x) with self.assertRaisesRegexp( ValueError, "The shape of Parameter should not be None"): @@ -489,8 +486,7 @@ def constructor_with_kwargs(self, place): self.assertTrue( egr_tensor15.place._equals( paddle.fluid.framework._current_expected_place())) - self.assertTrue( - np.array_equal(egr_tensor15.numpy(), egr_tensor4.numpy())) + np.testing.assert_array_equal(egr_tensor15.numpy(), egr_tensor4.numpy()) egr_tensor16 = core.eager.Tensor(value=egr_tensor4, name="new_eager_tensor") @@ -502,8 +498,7 @@ def constructor_with_kwargs(self, place): self.assertTrue( egr_tensor16.place._equals( paddle.fluid.framework._current_expected_place())) - self.assertTrue( - np.array_equal(egr_tensor16.numpy(), egr_tensor4.numpy())) + np.testing.assert_array_equal(egr_tensor16.numpy(), egr_tensor4.numpy()) egr_tensor17 = core.eager.Tensor( value=egr_tensor4, @@ -516,8 +511,7 @@ def constructor_with_kwargs(self, place): self.assertEqual(egr_tensor17.dtype, egr_tensor4.dtype) self.assertEqual(egr_tensor17.stop_gradient, True) self.assertTrue(egr_tensor17.place._equals(place)) - self.assertTrue( - np.array_equal(egr_tensor17.numpy(), egr_tensor4.numpy())) + np.testing.assert_array_equal(egr_tensor17.numpy(), egr_tensor4.numpy()) egr_tensor18 = core.eager.Tensor( egr_tensor4, @@ -530,8 +524,7 @@ def constructor_with_kwargs(self, place): self.assertEqual(egr_tensor18.dtype, egr_tensor4.dtype) self.assertEqual(egr_tensor18.stop_gradient, True) self.assertTrue(egr_tensor18.place._equals(place)) - self.assertTrue( - np.array_equal(egr_tensor18.numpy(), egr_tensor4.numpy())) + np.testing.assert_array_equal(egr_tensor18.numpy(), egr_tensor4.numpy()) egr_tensor19 = core.eager.Tensor( egr_tensor4, @@ -544,8 +537,7 @@ def constructor_with_kwargs(self, place): self.assertEqual(egr_tensor19.dtype, egr_tensor4.dtype) self.assertEqual(egr_tensor19.stop_gradient, True) self.assertTrue(egr_tensor19.place._equals(place)) - self.assertTrue( - np.array_equal(egr_tensor19.numpy(), egr_tensor4.numpy())) + np.testing.assert_array_equal(egr_tensor19.numpy(), egr_tensor4.numpy()) # init eager tensor by framework tensor x = np.random.rand(3, 3).astype('float32') @@ -560,7 +552,7 @@ def constructor_with_kwargs(self, place): self.assertTrue( egr_tensor20.place._equals( paddle.fluid.framework._current_expected_place())) - self.assertTrue(np.array_equal(egr_tensor20.numpy(), x)) + np.testing.assert_array_equal(egr_tensor20.numpy(), x) egr_tensor21 = core.eager.Tensor(value=t, place=place) self.assertEqual(egr_tensor21.persistable, False) @@ -569,7 +561,7 @@ def constructor_with_kwargs(self, place): self.assertEqual(egr_tensor21.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor21.stop_gradient, True) self.assertTrue(egr_tensor21.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor21.numpy(), x)) + np.testing.assert_array_equal(egr_tensor21.numpy(), x) egr_tensor22 = core.eager.Tensor(t, place=place) self.assertEqual(egr_tensor22.persistable, False) @@ -578,7 +570,7 @@ def constructor_with_kwargs(self, place): self.assertEqual(egr_tensor22.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor22.stop_gradient, True) self.assertTrue(egr_tensor22.place._equals(place)) - 
self.assertTrue(np.array_equal(egr_tensor22.numpy(), x)) + np.testing.assert_array_equal(egr_tensor22.numpy(), x) egr_tensor23 = core.eager.Tensor(t, place, name="from_framework_tensor") self.assertEqual(egr_tensor23.persistable, False) @@ -587,7 +579,7 @@ def constructor_with_kwargs(self, place): self.assertEqual(egr_tensor23.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor23.stop_gradient, True) self.assertTrue(egr_tensor23.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor23.numpy(), x)) + np.testing.assert_array_equal(egr_tensor23.numpy(), x) egr_tensor24 = core.eager.Tensor(value=t, place=place, @@ -598,7 +590,7 @@ def constructor_with_kwargs(self, place): self.assertEqual(egr_tensor24.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor24.stop_gradient, True) self.assertTrue(egr_tensor24.place._equals(place)) - self.assertTrue(np.array_equal(egr_tensor24.numpy(), x)) + np.testing.assert_array_equal(egr_tensor24.numpy(), x) # Bad usage # SyntaxError: positional argument follows keyword argument @@ -632,53 +624,53 @@ def test_copy_and_copy_to(self): core.CPUPlace()) tensor1.persistable = True self.assertEqual(tensor1.stop_gradient, True) - self.assertTrue(np.array_equal(tensor.numpy(), arr)) + np.testing.assert_array_equal(tensor.numpy(), arr) print("Test copy_") tensor.copy_(tensor1, True) self.assertEqual(tensor.persistable, False) self.assertEqual(tensor.shape, [4, 16]) self.assertEqual(tensor.dtype, core.VarDesc.VarType.FP32) - self.assertTrue(np.array_equal(tensor.numpy(), arr1)) + np.testing.assert_array_equal(tensor.numpy(), arr1) print("Test _copy_to") tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, core.CPUPlace()) - self.assertTrue(np.array_equal(tensor2.numpy(), arr2)) + np.testing.assert_array_equal(tensor2.numpy(), arr2) self.assertTrue(tensor2.place.is_cpu_place()) tensor2.persistable = True tensor2.stop_gradient = False if core.is_compiled_with_cuda(): tensor3 = tensor2._copy_to(core.CUDAPlace(0), True) - self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) + np.testing.assert_array_equal(tensor3.numpy(), arr2) self.assertEqual(tensor3.persistable, True) self.assertEqual(tensor3.stop_gradient, True) self.assertTrue(tensor3.place.is_gpu_place()) tensor4 = tensor2.cuda(0, True) - self.assertTrue(np.array_equal(tensor4.numpy(), arr2)) + np.testing.assert_array_equal(tensor4.numpy(), arr2) self.assertEqual(tensor4.persistable, True) self.assertEqual(tensor4.stop_gradient, False) self.assertTrue(tensor4.place.is_gpu_place()) tensor5 = tensor4.cpu() - self.assertTrue(np.array_equal(tensor5.numpy(), arr2)) + np.testing.assert_array_equal(tensor5.numpy(), arr2) self.assertEqual(tensor5.persistable, True) self.assertEqual(tensor5.stop_gradient, False) self.assertTrue(tensor5.place.is_cpu_place()) tensor10 = paddle.to_tensor([1, 2, 3], place='gpu_pinned') tensor11 = tensor10._copy_to(core.CUDAPlace(0), True) - self.assertTrue( - np.array_equal(tensor10.numpy(), tensor11.numpy())) + np.testing.assert_array_equal(tensor10.numpy(), + tensor11.numpy()) else: tensor3 = tensor2._copy_to(core.CPUPlace(), True) - self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) + np.testing.assert_array_equal(tensor3.numpy(), arr2) self.assertEqual(tensor3.persistable, True) self.assertEqual(tensor3.stop_gradient, True) self.assertTrue(tensor3.place.is_cpu_place()) tensor4 = tensor2.cpu() - self.assertTrue(np.array_equal(tensor4.numpy(), arr2)) + np.testing.assert_array_equal(tensor4.numpy(), arr2) self.assertEqual(tensor4.persistable, True) 
self.assertEqual(tensor4.stop_gradient, False) self.assertTrue(tensor4.place.is_cpu_place()) @@ -700,15 +692,15 @@ def test_share_buffer_to(self): else: tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, core.CPUPlace()) - self.assertTrue(np.array_equal(tensor.numpy(), arr)) - self.assertTrue(np.array_equal(tensor2.numpy(), arr2)) + np.testing.assert_array_equal(tensor.numpy(), arr) + np.testing.assert_array_equal(tensor2.numpy(), arr2) tensor2._share_buffer_to(tensor) - self.assertTrue(np.array_equal(tensor.numpy(), arr2)) - self.assertTrue(np.array_equal(tensor2.numpy(), arr2)) + np.testing.assert_array_equal(tensor.numpy(), arr2) + np.testing.assert_array_equal(tensor2.numpy(), arr2) self.assertTrue(tensor._is_shared_buffer_with(tensor2)) self.assertTrue(tensor2._is_shared_buffer_with(tensor)) tensor._share_buffer_to(tensor3) - self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) + np.testing.assert_array_equal(tensor3.numpy(), arr2) self.assertTrue(tensor3._is_shared_buffer_with(tensor)) def test_share_underline_tensor_to(self): @@ -728,15 +720,15 @@ def test_share_underline_tensor_to(self): else: tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, core.CPUPlace()) - self.assertTrue(np.array_equal(tensor.numpy(), arr)) - self.assertTrue(np.array_equal(tensor2.numpy(), arr2)) + np.testing.assert_array_equal(tensor.numpy(), arr) + np.testing.assert_array_equal(tensor2.numpy(), arr2) tensor2._share_underline_tensor_to(tensor) - self.assertTrue(np.array_equal(tensor.numpy(), arr2)) - self.assertTrue(np.array_equal(tensor2.numpy(), arr2)) + np.testing.assert_array_equal(tensor.numpy(), arr2) + np.testing.assert_array_equal(tensor2.numpy(), arr2) self.assertTrue(tensor._is_shared_underline_tensor_with(tensor2)) self.assertTrue(tensor2._is_shared_underline_tensor_with(tensor)) tensor._share_underline_tensor_to(tensor3) - self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) + np.testing.assert_array_equal(tensor3.numpy(), arr2) self.assertTrue(tensor3._is_shared_underline_tensor_with(tensor)) def test_properties(self): @@ -810,7 +802,7 @@ def test_set_value(self): egr_tensor = core.eager.Tensor(value=ori_arr) self.assertEqual(egr_tensor.stop_gradient, True) self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) - self.assertTrue(np.array_equal(egr_tensor.numpy(), ori_arr)) + np.testing.assert_array_equal(egr_tensor.numpy(), ori_arr) ori_place = egr_tensor.place new_arr = np.random.rand(4, 16, 16, 32).astype('float32') @@ -820,7 +812,7 @@ def test_set_value(self): self.assertEqual(egr_tensor.stop_gradient, True) self.assertTrue(egr_tensor.place._equals(ori_place)) self.assertEqual(egr_tensor.shape, [4, 16, 16, 32]) - self.assertTrue(np.array_equal(egr_tensor.numpy(), new_arr)) + np.testing.assert_array_equal(egr_tensor.numpy(), new_arr) def test_sharding_related_api(self): with _test_eager_guard(): @@ -839,7 +831,7 @@ def test_copy_gradient_from(self): out = x + x out.backward() x._copy_gradient_from(y) - self.assertTrue(np.array_equal(x.grad.numpy(), np_y)) + np.testing.assert_array_equal(x.grad.numpy(), np_y) def test_clear(self): with _test_eager_guard(): @@ -862,11 +854,10 @@ def test_copy(self): linear = paddle.nn.Linear(1, 3) linear_copy = copy.deepcopy(linear) linear_copy2 = linear.weight._copy_to(core.CPUPlace(), True) - self.assertTrue( - np.array_equal(linear.weight.numpy(), - linear_copy.weight.numpy())) - self.assertTrue( - np.array_equal(linear.weight.numpy(), linear_copy2.numpy())) + np.testing.assert_array_equal(linear.weight.numpy(), + 
linear_copy.weight.numpy()) + np.testing.assert_array_equal(linear.weight.numpy(), + linear_copy2.numpy()) def func_fp16_initilaizer(self): paddle.set_default_dtype("float16") @@ -905,7 +896,7 @@ def test_fp16_initializer(self): res2 = self.func_fp16_initilaizer() for i in range(len(res1)): - self.assertTrue(np.array_equal(res1[i], res2[i])) + np.testing.assert_array_equal(res1[i], res2[i]) def func_layer_helper_base(self, value): base = paddle.fluid.layer_helper_base.LayerHelperBase( @@ -924,8 +915,8 @@ def test_to_variable(self): res3 = self.func_base_to_variable(value) res2 = self.func_layer_helper_base(value) res4 = self.func_base_to_variable(value) - self.assertTrue(np.array_equal(res1, res2)) - self.assertTrue(np.array_equal(res3, res4)) + np.testing.assert_array_equal(res1, res2) + np.testing.assert_array_equal(res3, res4) def test_backward_with_single_tensor(self): with _test_eager_guard(): @@ -939,11 +930,11 @@ def test_backward_with_single_tensor(self): self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor12.stop_gradient, True) self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) - self.assertTrue(np.array_equal(egr_tensor12.numpy(), arr4)) - self.assertTrue(np.array_equal(egr_tensor12.gradient(), None)) + np.testing.assert_array_equal(egr_tensor12.numpy(), arr4) + np.testing.assert_array_equal(egr_tensor12.gradient(), None) egr_tensor12.stop_gradient = False egr_tensor12.backward() - self.assertTrue(np.array_equal(egr_tensor12.gradient(), arr)) + np.testing.assert_array_equal(egr_tensor12.gradient(), arr) def test_set_value(self): with _test_eager_guard(): @@ -953,7 +944,7 @@ def test_set_value(self): self.assertFalse(np.array_equal(linear.weight.numpy(), new_weight)) linear.weight.set_value(new_weight) - self.assertTrue(np.array_equal(linear.weight.numpy(), new_weight)) + np.testing.assert_array_equal(linear.weight.numpy(), new_weight) self.assertTrue(linear.weight.place._equals(ori_place)) diff --git a/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py index 3b5ec683bc7bd..64f8d5bbedc69 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py @@ -40,28 +40,28 @@ def test_constructor_with_args(self): ST2 = core.eager.StringTensor(shape, "ST2") # constructor 2 self.assertEqual(ST2.name, "ST2") self.assertEqual(ST2.shape, shape) - self.assertTrue( - np.array_equal(ST2.numpy(), np.empty(shape, dtype=np.unicode_))) + np.testing.assert_array_equal(ST2.numpy(), + np.empty(shape, dtype=np.unicode_)) ST3 = core.eager.StringTensor(self.str_arr, "ST3") # constructor 3 self.assertEqual(ST3.name, "ST3") self.assertEqual(ST3.shape, list(self.str_arr.shape)) - self.assertTrue(np.array_equal(ST3.numpy(), self.str_arr)) + np.testing.assert_array_equal(ST3.numpy(), self.str_arr) ST4 = core.eager.StringTensor(self.str_arr) # constructor 4 self.assertEqual(ST4.name, "generated_string_tensor_1") self.assertEqual(ST4.shape, list(self.str_arr.shape)) - self.assertTrue(np.array_equal(ST4.numpy(), self.str_arr)) + np.testing.assert_array_equal(ST4.numpy(), self.str_arr) ST5 = core.eager.StringTensor(ST4) # constructor 5 self.assertEqual(ST5.name, "generated_string_tensor_2") self.assertEqual(ST5.shape, list(self.str_arr.shape)) - self.assertTrue(np.array_equal(ST5.numpy(), self.str_arr)) + np.testing.assert_array_equal(ST5.numpy(), self.str_arr) ST6 = 
core.eager.StringTensor(ST5, "ST6") # constructor 6 self.assertEqual(ST6.name, "ST6") self.assertEqual(ST6.shape, list(self.str_arr.shape)) - self.assertTrue(np.array_equal(ST6.numpy(), self.str_arr)) + np.testing.assert_array_equal(ST6.numpy(), self.str_arr) for st in [ST1, ST2, ST3, ST4, ST5, ST6]: # All StringTensors are on cpu place so far. @@ -74,25 +74,25 @@ def test_constructor_with_kwargs(self): name="ST1") # constructor 2 self.assertEqual(ST1.name, "ST1") self.assertEqual(ST1.shape, shape) - self.assertTrue( - np.array_equal(ST1.numpy(), np.empty(shape, dtype=np.unicode_))) + np.testing.assert_array_equal(ST1.numpy(), + np.empty(shape, dtype=np.unicode_)) ST2 = core.eager.StringTensor(self.str_arr, name="ST2") # constructor 3 self.assertEqual(ST2.name, "ST2") self.assertEqual(ST2.shape, list(self.str_arr.shape)) - self.assertTrue(np.array_equal(ST2.numpy(), self.str_arr)) + np.testing.assert_array_equal(ST2.numpy(), self.str_arr) ST3 = core.eager.StringTensor(ST2, name="ST3") # constructor 6 self.assertEqual(ST3.name, "ST3") self.assertEqual(ST3.shape, list(self.str_arr.shape)) - self.assertTrue(np.array_equal(ST3.numpy(), self.str_arr)) + np.testing.assert_array_equal(ST3.numpy(), self.str_arr) ST4 = core.eager.StringTensor(value=ST2, name="ST4") # constructor 6 self.assertEqual(ST4.name, "ST4") self.assertEqual(ST4.shape, list(self.str_arr.shape)) - self.assertTrue(np.array_equal(ST4.numpy(), self.str_arr)) + np.testing.assert_array_equal(ST4.numpy(), self.str_arr) for st in [ST1, ST2, ST3, ST4]: # All StringTensors are on cpu place so far. self.assertTrue(st.place._equals(core.CPUPlace())) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py index e23662483919f..25a0c0a0652c2 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py @@ -206,11 +206,15 @@ def check_main(self, x_shape, y_shape, axis=-1): z_1, x_g_1, y_g_1 = self.get_out_and_grad(x_np, y_np, axis, place, False) z_2, x_g_2, y_g_2 = self.get_out_and_grad(x_np, y_np, axis, place, True) - self.assertTrue(np.array_equal(z_1, z_2), "{} vs {}".format(z_1, z_2)) - self.assertTrue(np.array_equal(x_g_1, x_g_2), - "{} vs {}".format(x_g_1, x_g_2)) - self.assertTrue(np.array_equal(y_g_1, y_g_2), - "{} vs {}".format(y_g_1, y_g_2)) + np.testing.assert_array_equal(z_1, + z_2, + err_msg='{} vs {}'.format(z_1, z_2)) + np.testing.assert_array_equal(x_g_1, + x_g_2, + err_msg='{} vs {}'.format(x_g_1, x_g_2)) + np.testing.assert_array_equal(y_g_1, + y_g_2, + err_msg='{} vs {}'.format(y_g_1, y_g_2)) def test_main(self): self.check_main((13, 17), (13, 17)) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py index 12f2a21736084..904b9fe06de74 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py @@ -210,9 +210,9 @@ def test_grad(self): y.stop_gradient = False res = x**y res.backward() - self.assertTrue(np.array_equal(res.gradient(), self.grad_res)) - self.assertTrue(np.array_equal(x.gradient(), self.grad_x)) - self.assertTrue(np.array_equal(y.gradient(), self.grad_y)) + np.testing.assert_array_equal(res.gradient(), self.grad_res) + np.testing.assert_array_equal(x.gradient(), self.grad_x) + np.testing.assert_array_equal(y.gradient(), self.grad_y) 
fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) diff --git a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py index 74d101497b8ed..b42a85fbb91bd 100644 --- a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py @@ -36,7 +36,7 @@ def test_check_grad(self): for p in self.get_places(): grad_value1 = self.run_program(p, stop_gradient=False) grad_value2 = self.run_program(p, stop_gradient=True) - self.assertTrue(np.array_equal(grad_value1, grad_value2)) + np.testing.assert_array_equal(grad_value1, grad_value2) def run_program(self, place, stop_gradient=False): np.random.seed(1) diff --git a/python/paddle/fluid/tests/unittests/test_executor_check_fetch_list.py b/python/paddle/fluid/tests/unittests/test_executor_check_fetch_list.py index 9d1c902fdc29d..ec7271935569b 100644 --- a/python/paddle/fluid/tests/unittests/test_executor_check_fetch_list.py +++ b/python/paddle/fluid/tests/unittests/test_executor_check_fetch_list.py @@ -50,7 +50,7 @@ def test_with_tuple(self): fetch_list=[self.fetch_list], # support single list/tuple return_numpy=True) - self.assertTrue(np.array_equal(res[0], self.expected)) + np.testing.assert_array_equal(res[0], self.expected) def test_with_error(self): with self.assertRaises(TypeError): diff --git a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py b/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py index 81bc702128052..a78528a73d84c 100644 --- a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py +++ b/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py @@ -65,7 +65,7 @@ def test_executor_run_twice(self): add_out1 = np.array(add_out[0]) mul_out = self.calc_mul_out(place, parallel) add_out2 = np.array(add_out[0]) - self.assertTrue(np.array_equal(add_out1, add_out2)) + np.testing.assert_array_equal(add_out1, add_out2) class TestExecutorReturnTensorNotOverOverwritingWithLayers(unittest.TestCase): @@ -108,7 +108,7 @@ def test_executor_run_twice(self): add_out1 = np.array(add_out[0]) sub_out = self.calc_sub_out(place, parallel) add_out2 = np.array(add_out[0]) - self.assertTrue(np.array_equal(add_out1, add_out2)) + np.testing.assert_array_equal(add_out1, add_out2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py index d0d9a1f7e21fe..9336c4a405e21 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_op.py @@ -272,10 +272,8 @@ def test_expand_times_is_tensor(self): c = paddle.fluid.layers.expand(a, expand_times=paddle.to_tensor( [2, 3], dtype='int32')) - self.assertTrue( - np.array_equal(b.numpy(), np.tile(a.numpy(), [2, 3]))) - self.assertTrue( - np.array_equal(c.numpy(), np.tile(a.numpy(), [2, 3]))) + np.testing.assert_array_equal(b.numpy(), np.tile(a.numpy(), [2, 3])) + np.testing.assert_array_equal(c.numpy(), np.tile(a.numpy(), [2, 3])) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index 52b9234263d96..6fc6fc8f7eb6b 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -277,11 +277,11 @@ def 
test_expand_times_is_tensor(self): np_array = np.array([2, 5]) expand_2 = paddle.expand(a, shape=np_array) - self.assertTrue( - np.array_equal(egr_expand_1.numpy(), egr_expand_2.numpy())) - self.assertTrue(np.array_equal(expand_1.numpy(), expand_2.numpy())) - self.assertTrue( - np.array_equal(expand_1.numpy(), egr_expand_1.numpy())) + np.testing.assert_array_equal(egr_expand_1.numpy(), + egr_expand_2.numpy()) + np.testing.assert_array_equal(expand_1.numpy(), expand_2.numpy()) + np.testing.assert_array_equal(expand_1.numpy(), + egr_expand_1.numpy()) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py index 72b4d8990446d..2438b754a1238 100644 --- a/python/paddle/fluid/tests/unittests/test_exponential_op.py +++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py @@ -94,7 +94,7 @@ def test_dygraph(self): self.assertTrue(np.min(y.numpy()) >= 0) y.backward() - self.assertTrue(np.array_equal(x.grad.numpy(), np.zeros([10, 10]))) + np.testing.assert_array_equal(x.grad.numpy(), np.zeros([10, 10])) paddle.enable_static() def test_fixed_random_number(self): diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py index 439296e4d8f84..ac69b8e8c6ff3 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -173,7 +173,7 @@ def run_program(num_flatten_dims): res_1 = run_program(-1) res_2 = run_program(2) - self.assertTrue(np.array_equal(res_1, res_2)) + np.testing.assert_array_equal(res_1, res_2) class TestFCOpError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_fetch_var.py b/python/paddle/fluid/tests/unittests/test_fetch_var.py index 2a0d29be47dad..1641adbb30cb6 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_var.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_var.py @@ -17,14 +17,14 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import op_test -import numpy +import numpy as np import unittest class TestFetchVar(unittest.TestCase): def set_input(self): - self.val = numpy.array([1, 3, 5]).astype(numpy.int32) + self.val = np.array([1, 3, 5]).astype(np.int32) def test_fetch_var(self): self.set_input() @@ -33,15 +33,17 @@ def test_fetch_var(self): exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_main_program(), feed={}, fetch_list=[]) fetched_x = fluid.executor._fetch_var("x") - self.assertTrue(numpy.array_equal(fetched_x, self.val), - "fetch_x=%s val=%s" % (fetched_x, self.val)) + np.testing.assert_array_equal(fetched_x, + self.val, + err_msg='fetch_x=%s val=%s' % + (fetched_x, self.val)) self.assertEqual(fetched_x.dtype, self.val.dtype) class TestFetchNullVar(TestFetchVar): def set_input(self): - self.val = numpy.array([]).astype(numpy.int32) + self.val = np.array([]).astype(np.int32) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_op.py index 1262c28edda84..305f2dfd3b173 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_any_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_any_op.py @@ -16,9 +16,11 @@ import paddle import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard, in_dygraph_mode import unittest import numpy as np from op_test import OpTest +from paddle.tensor.manipulation import fill_ class TestFillAnyOp(OpTest): @@ -75,5 +77,41 @@ def 
init(self): self.value = 11111.1111 +class TestFillAnyInplace(unittest.TestCase): + + def test_fill_any_version(self): + with paddle.fluid.dygraph.guard(): + var = paddle.to_tensor(np.ones((4, 2, 3)).astype(np.float32)) + self.assertEqual(var.inplace_version, 0) + + var.fill_(0) + self.assertEqual(var.inplace_version, 1) + + var.fill_(0) + self.assertEqual(var.inplace_version, 2) + + var.fill_(0) + self.assertEqual(var.inplace_version, 3) + + def test_fill_any_eqaul(self): + with paddle.fluid.dygraph.guard(): + tensor = paddle.to_tensor( + np.random.random((20, 30)).astype(np.float32)) + target = tensor.numpy() + target[...] = 1 + + tensor.fill_(1) + self.assertEqual((tensor.numpy() == target).all().item(), True) + + def test_backward(self): + with paddle.fluid.dygraph.guard(): + x = paddle.full([10, 10], -1., dtype='float32') + x.stop_gradient = False + y = 2 * x + y.fill_(1) + y.backward() + np.testing.assert_array_equal(x.grad.numpy(), np.zeros([10, 10])) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index bd87181ebcc91..945bf11c53383 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -127,7 +127,7 @@ def check_with_place(self, place): result_array = np.array(out.get_tensor()) full_array = np.full((123, 92), 3.8, 'float32') - self.assertTrue(np.array_equal(result_array, full_array)) + np.testing.assert_array_equal(result_array, full_array) def test_fill_constant_with_selected_rows(self): places = [core.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_fill_diagonal_tensor_op.py b/python/paddle/fluid/tests/unittests/test_fill_diagonal_tensor_op.py index c1a187d7bbaaf..a35dd611cb2e9 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_diagonal_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_diagonal_tensor_op.py @@ -86,6 +86,7 @@ class TensorFillDiagTensor_Test(OpTest): def setUp(self): self.op_type = "fill_diagonal_tensor" + self.python_api = paddle.tensor.manipulation.fill_diagonal_tensor self.init_kernel_type() x = np.random.random((10, 10)).astype(self.dtype) y = np.random.random((10, )).astype(self.dtype) @@ -96,22 +97,23 @@ def setUp(self): self.inputs = {"X": x, "Y": y} self.outputs = {'Out': out} - self.attrs = {"dim1": dim1, "dim2": dim2, "offset": offset} + self.attrs = {"offset": offset, "dim1": dim1, "dim2": dim2} def init_kernel_type(self): self.dtype = np.float64 def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TensorFillDiagTensor_Test2(TensorFillDiagTensor_Test): def setUp(self): self.op_type = "fill_diagonal_tensor" + self.python_api = paddle.tensor.manipulation.fill_diagonal_tensor self.init_kernel_type() x = np.random.random((2, 20, 25)).astype(self.dtype) y = np.random.random((2, 20)).astype(self.dtype) @@ -122,7 +124,7 @@ def setUp(self): self.inputs = {"X": x, "Y": y} self.outputs = {'Out': out} - self.attrs = {"dim1": dim1, "dim2": dim2, "offset": offset} + self.attrs = {"offset": offset, "dim1": dim1, "dim2": dim2} def init_kernel_type(self): self.dtype = np.float32 @@ -132,6 +134,7 @@ class TensorFillDiagTensor_Test3(TensorFillDiagTensor_Test): def setUp(self): self.op_type = "fill_diagonal_tensor" + self.python_api = 
paddle.tensor.manipulation.fill_diagonal_tensor self.init_kernel_type() x = np.random.random((2, 20, 20, 3)).astype(self.dtype) y = np.random.random((2, 3, 18)).astype(self.dtype) @@ -142,11 +145,12 @@ def setUp(self): self.inputs = {"X": x, "Y": y} self.outputs = {'Out': out} - self.attrs = {"dim1": dim1, "dim2": dim2, "offset": offset} + self.attrs = {"offset": offset, "dim1": dim1, "dim2": dim2} def init_kernel_type(self): self.dtype = np.float16 if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_op.py b/python/paddle/fluid/tests/unittests/test_fill_op.py index fdf4ec85627d5..29cf097acc016 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_op.py @@ -78,7 +78,7 @@ def check_with_place(self, place, f_cpu): result_array = np.array(out) full_array = np.array(val, 'float32') - self.assertTrue(np.array_equal(result_array, full_array)) + np.testing.assert_array_equal(result_array, full_array) def test_fill_op(self): places = [core.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_fold_op.py b/python/paddle/fluid/tests/unittests/test_fold_op.py index a919cac6b7d94..fc873cda95bb8 100644 --- a/python/paddle/fluid/tests/unittests/test_fold_op.py +++ b/python/paddle/fluid/tests/unittests/test_fold_op.py @@ -91,13 +91,14 @@ def set_data(self): def setUp(self): self.op_type = 'fold' + self.python_api = paddle.nn.functional.fold self.set_data() def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Y') + self.check_grad(['X'], 'Y', check_eager=True) class TestFoldAPI(TestFoldOp): @@ -106,6 +107,7 @@ class TestFoldAPI(TestFoldOp): def setUp(self): self.op_type = 'fold' + self.python_api = paddle.nn.functional.fold self.set_data() self.places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py b/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py index f2f56e42543c6..b26b5e2f04634 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py +++ b/python/paddle/fluid/tests/unittests/test_fused_matmul_bias.py @@ -97,19 +97,19 @@ def rand_test_base(self, m, n, k, trans_x, trans_y, need_bias, dtype, seed): z = fused_matmul_bias(x, y, bias, trans_x, trans_y) z_np = matmul(x_np, y_np, bias_np, trans_x, trans_y) - self.assertTrue(np.array_equal(z.numpy(), z_np)) + np.testing.assert_array_equal(z.numpy(), z_np) z_grad_np = self.rand_data(z_np.shape, dtype) paddle.autograd.backward(z, grad_tensors=[paddle.to_tensor(z_grad_np)]) x_grad_np, y_grad_np, bias_grad_np = matmul_grad( x_np, y_np, bias_np, z_grad_np, trans_x, trans_y) - self.assertTrue(np.array_equal(x.grad.numpy(), x_grad_np)) + np.testing.assert_array_equal(x.grad.numpy(), x_grad_np) self.assertEqual(y_grad_np.shape, y_np.shape) - self.assertTrue(np.array_equal(y.grad.numpy(), y_grad_np)) + np.testing.assert_array_equal(y.grad.numpy(), y_grad_np) if need_bias: - self.assertTrue(np.array_equal(bias.grad.numpy(), bias_grad_np)) + np.testing.assert_array_equal(bias.grad.numpy(), bias_grad_np) else: self.assertTrue(bias_grad_np is None) @@ -138,7 +138,7 @@ def check_fused_linear(self, transpose): linear = FusedLinear(40, 50, transpose_weight=transpose) y1 = linear(x) y2 = fused_linear(x, linear.weight, linear.bias, transpose) - self.assertTrue(np.array_equal(y1.numpy(), y2.numpy())) + 
np.testing.assert_array_equal(y1.numpy(), y2.numpy()) def test_non_transpose(self): self.check_fused_linear(False) diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py index ecfc8a5bc292c..65276e9c92e96 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py @@ -548,5 +548,60 @@ def config(self): self.layers = 3 # odd layers +class TestFusedMultiTransformerOpPostLayerNormFp16(TestFusedMultiTransformerOp): + + def config(self): + super().config() + self.x_type = np.float16 + self.layers = 3 # odd layers + self.pre_layer_norm = False + + +class TestFusedMultiTransformerOpCacheKVPostLayerNorm( + TestFusedMultiTransformerOp): + + def config(self): + super().config() + self.has_cache_kv = True + self.query_length = 1 + self.key_length, self.value_length = 1, 1 + self.layers = 3 # odd layers + self.pre_layer_norm = False + + +class TestFusedMultiTransformerOpCacheKVPostLayerNormFp16( + TestFusedMultiTransformerOp): + + def config(self): + super().config() + self.has_cache_kv = True + self.query_length = 1 + self.key_length, self.value_length = 1, 1 + self.x_type = np.float16 + self.pre_layer_norm = False + + +class TestFusedMultiTransformerOpGenCacheKVPostLayerNorm( + TestFusedMultiTransformerOp): + + def config(self): + super().config() + self.has_cache_kv = True + self.gen_cache_kv = True + self.pre_layer_norm = False + + +class TestFusedMultiTransformerOpGenCacheKVPostLayerNormFp16( + TestFusedMultiTransformerOp): + + def config(self): + super().config() + self.has_cache_kv = True + self.gen_cache_kv = True + self.x_type = np.float16 + self.layers = 3 # odd layers + self.pre_layer_norm = False + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index f76094c92eb8a..6a0fdc4ff61ea 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -341,7 +341,7 @@ def test_static_graph(): gpu_value = gpu_exe.run(feed=feed, fetch_list=fetch)[0] return gpu_value - self.assertTrue(np.array_equal(test_dygraph(), test_static_graph())) + np.testing.assert_array_equal(test_dygraph(), test_static_graph()) class TestGathertError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py index b1a4b45d7d257..506176d146c57 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py @@ -26,6 +26,36 @@ from test_generate_proposals_op import clip_tiled_boxes, box_coder, nms +def python_generate_proposals_v2( + scores, + bbox_deltas, + img_size, + anchors, + variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0, + pixel_offset=False, + return_rois_num=True, +): + rpn_rois, rpn_roi_probs, rpn_rois_num = paddle.vision.ops.generate_proposals( + scores, + bbox_deltas, + img_size, + anchors, + variances, + pre_nms_top_n=pre_nms_top_n, + post_nms_top_n=post_nms_top_n, + nms_thresh=nms_thresh, + min_size=min_size, + eta=eta, + pixel_offset=pixel_offset, + return_rois_num=return_rois_num) + return rpn_rois, rpn_roi_probs + + def generate_proposals_v2_in_python(scores, bbox_deltas, 
im_shape, anchors, variances, pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta, pixel_offset): @@ -155,15 +185,16 @@ def set_data(self): } self.outputs = { - 'RpnRois': (self.rpn_rois[0], [self.rois_num]), - 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]), + 'RpnRois': self.rpn_rois[0], + 'RpnRoiProbs': self.rpn_roi_probs[0], } def test_check_output(self): - self.check_output() + self.check_output(check_eager=False) def setUp(self): self.op_type = "generate_proposals_v2" + self.python_api = python_generate_proposals_v2 self.set_data() def init_test_params(self): @@ -202,150 +233,117 @@ def init_test_output(self): self.nms_thresh, self.min_size, self.eta, self.pixel_offset) -class TestGenerateProposalsV2OutLodOp(TestGenerateProposalsV2Op): - - def set_data(self): - self.init_test_params() - self.init_test_input() - self.init_test_output() - self.inputs = { - 'Scores': self.scores, - 'BboxDeltas': self.bbox_deltas, - 'ImShape': self.im_shape.astype(np.float32), - 'Anchors': self.anchors, - 'Variances': self.variances - } - - self.attrs = { - 'pre_nms_topN': self.pre_nms_topN, - 'post_nms_topN': self.post_nms_topN, - 'nms_thresh': self.nms_thresh, - 'min_size': self.min_size, - 'eta': self.eta, - 'return_rois_num': True - } - - self.outputs = { - 'RpnRois': (self.rpn_rois[0], [self.rois_num]), - 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]), - 'RpnRoisNum': (np.asarray(self.rois_num, dtype=np.int32)) - } - - -class TestGenerateProposalsV2OpNoBoxLeft(TestGenerateProposalsV2Op): - - def init_test_params(self): - self.pre_nms_topN = 12000 # train 12000, test 2000 - self.post_nms_topN = 5000 # train 6000, test 1000 - self.nms_thresh = 0.7 - self.min_size = 1000.0 - self.eta = 1. - self.pixel_offset = True - - -class TestGenerateProposalsV2OpNoOffset(TestGenerateProposalsV2Op): - - def init_test_params(self): - self.pre_nms_topN = 12000 # train 12000, test 2000 - self.post_nms_topN = 5000 # train 6000, test 1000 - self.nms_thresh = 0.7 - self.min_size = 3.0 - self.eta = 1. 
- self.pixel_offset = False - - -class testGenerateProposalsAPI(unittest.TestCase): - - def setUp(self): - np.random.seed(678) - self.scores_np = np.random.rand(2, 3, 4, 4).astype('float32') - self.bbox_deltas_np = np.random.rand(2, 12, 4, 4).astype('float32') - self.img_size_np = np.array([[8, 8], [6, 6]]).astype('float32') - self.anchors_np = np.reshape(np.arange(4 * 4 * 3 * 4), - [4, 4, 3, 4]).astype('float32') - self.variances_np = np.ones((4, 4, 3, 4)).astype('float32') - - self.roi_expected, self.roi_probs_expected, self.rois_num_expected = generate_proposals_v2_in_python( - self.scores_np, - self.bbox_deltas_np, - self.img_size_np, - self.anchors_np, - self.variances_np, - pre_nms_topN=10, - post_nms_topN=5, - nms_thresh=0.5, - min_size=0.1, - eta=1.0, - pixel_offset=False) - self.roi_expected = np.array(self.roi_expected).squeeze(1) - self.roi_probs_expected = np.array(self.roi_probs_expected).squeeze(1) - self.rois_num_expected = np.array(self.rois_num_expected) - - def test_dynamic(self): - paddle.disable_static() - scores = paddle.to_tensor(self.scores_np) - bbox_deltas = paddle.to_tensor(self.bbox_deltas_np) - img_size = paddle.to_tensor(self.img_size_np) - anchors = paddle.to_tensor(self.anchors_np) - variances = paddle.to_tensor(self.variances_np) - - rois, roi_probs, rois_num = paddle.vision.ops.generate_proposals( - scores, - bbox_deltas, - img_size, - anchors, - variances, - pre_nms_top_n=10, - post_nms_top_n=5, - return_rois_num=True) - self.assertTrue(np.allclose(self.roi_expected, rois.numpy())) - self.assertTrue(np.allclose(self.roi_probs_expected, roi_probs.numpy())) - self.assertTrue(np.allclose(self.rois_num_expected, rois_num.numpy())) - - def test_static(self): - paddle.enable_static() - scores = paddle.static.data(name='scores', - shape=[2, 3, 4, 4], - dtype='float32') - bbox_deltas = paddle.static.data(name='bbox_deltas', - shape=[2, 12, 4, 4], - dtype='float32') - img_size = paddle.static.data(name='img_size', - shape=[2, 2], - dtype='float32') - anchors = paddle.static.data(name='anchors', - shape=[4, 4, 3, 4], - dtype='float32') - variances = paddle.static.data(name='variances', - shape=[4, 4, 3, 4], - dtype='float32') - rois, roi_probs, rois_num = paddle.vision.ops.generate_proposals( - scores, - bbox_deltas, - img_size, - anchors, - variances, - pre_nms_top_n=10, - post_nms_top_n=5, - return_rois_num=True) - exe = paddle.static.Executor() - rois, roi_probs, rois_num = exe.run( - paddle.static.default_main_program(), - feed={ - 'scores': self.scores_np, - 'bbox_deltas': self.bbox_deltas_np, - 'img_size': self.img_size_np, - 'anchors': self.anchors_np, - 'variances': self.variances_np, - }, - fetch_list=[rois.name, roi_probs.name, rois_num.name], - return_numpy=False) - - self.assertTrue(np.allclose(self.roi_expected, np.array(rois))) - self.assertTrue( - np.allclose(self.roi_probs_expected, np.array(roi_probs))) - self.assertTrue(np.allclose(self.rois_num_expected, np.array(rois_num))) - +# class TestGenerateProposalsV2OpNoBoxLeft(TestGenerateProposalsV2Op): + +# def init_test_params(self): +# self.pre_nms_topN = 12000 # train 12000, test 2000 +# self.post_nms_topN = 5000 # train 6000, test 1000 +# self.nms_thresh = 0.7 +# self.min_size = 1000.0 +# self.eta = 1. 
+# self.pixel_offset = True + +# class TestGenerateProposalsV2OpNoOffset(TestGenerateProposalsV2Op): + +# def init_test_params(self): +# self.pre_nms_topN = 12000 # train 12000, test 2000 +# self.post_nms_topN = 5000 # train 6000, test 1000 +# self.nms_thresh = 0.7 +# self.min_size = 3.0 +# self.eta = 1. +# self.pixel_offset = False + +# class testGenerateProposalsAPI(unittest.TestCase): + +# def setUp(self): +# np.random.seed(678) +# self.scores_np = np.random.rand(2, 3, 4, 4).astype('float32') +# self.bbox_deltas_np = np.random.rand(2, 12, 4, 4).astype('float32') +# self.img_size_np = np.array([[8, 8], [6, 6]]).astype('float32') +# self.anchors_np = np.reshape(np.arange(4 * 4 * 3 * 4), +# [4, 4, 3, 4]).astype('float32') +# self.variances_np = np.ones((4, 4, 3, 4)).astype('float32') + +# self.roi_expected, self.roi_probs_expected, self.rois_num_expected = generate_proposals_v2_in_python( +# self.scores_np, +# self.bbox_deltas_np, +# self.img_size_np, +# self.anchors_np, +# self.variances_np, +# pre_nms_topN=10, +# post_nms_topN=5, +# nms_thresh=0.5, +# min_size=0.1, +# eta=1.0, +# pixel_offset=False) +# self.roi_expected = np.array(self.roi_expected).squeeze(1) +# self.roi_probs_expected = np.array(self.roi_probs_expected).squeeze(1) +# self.rois_num_expected = np.array(self.rois_num_expected) + +# def test_dynamic(self): +# paddle.disable_static() +# scores = paddle.to_tensor(self.scores_np) +# bbox_deltas = paddle.to_tensor(self.bbox_deltas_np) +# img_size = paddle.to_tensor(self.img_size_np) +# anchors = paddle.to_tensor(self.anchors_np) +# variances = paddle.to_tensor(self.variances_np) + +# rois, roi_probs, rois_num = paddle.vision.ops.generate_proposals( +# scores, +# bbox_deltas, +# img_size, +# anchors, +# variances, +# pre_nms_top_n=10, +# post_nms_top_n=5, +# return_rois_num=True) +# self.assertTrue(np.allclose(self.roi_expected, rois.numpy())) +# self.assertTrue(np.allclose(self.roi_probs_expected, roi_probs.numpy())) +# self.assertTrue(np.allclose(self.rois_num_expected, rois_num.numpy())) + +# def test_static(self): +# paddle.enable_static() +# scores = paddle.static.data(name='scores', +# shape=[2, 3, 4, 4], +# dtype='float32') +# bbox_deltas = paddle.static.data(name='bbox_deltas', +# shape=[2, 12, 4, 4], +# dtype='float32') +# img_size = paddle.static.data(name='img_size', +# shape=[2, 2], +# dtype='float32') +# anchors = paddle.static.data(name='anchors', +# shape=[4, 4, 3, 4], +# dtype='float32') +# variances = paddle.static.data(name='variances', +# shape=[4, 4, 3, 4], +# dtype='float32') +# rois, roi_probs, rois_num = paddle.vision.ops.generate_proposals( +# scores, +# bbox_deltas, +# img_size, +# anchors, +# variances, +# pre_nms_top_n=10, +# post_nms_top_n=5, +# return_rois_num=True) +# exe = paddle.static.Executor() +# rois, roi_probs, rois_num = exe.run( +# paddle.static.default_main_program(), +# feed={ +# 'scores': self.scores_np, +# 'bbox_deltas': self.bbox_deltas_np, +# 'img_size': self.img_size_np, +# 'anchors': self.anchors_np, +# 'variances': self.variances_np, +# }, +# fetch_list=[rois.name, roi_probs.name, rois_num.name], +# return_numpy=False) + +# self.assertTrue(np.allclose(self.roi_expected, np.array(rois))) +# self.assertTrue( +# np.allclose(self.roi_probs_expected, np.array(roi_probs))) +# self.assertTrue(np.allclose(self.rois_num_expected, np.array(rois_num))) if __name__ == '__main__': paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py 
index c0fdb134f16d6..73c1525519066 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py @@ -28,8 +28,8 @@ def graph_send_recv_wrapper(x, pool_type="sum", out_size=None, name=None): - return paddle.incubate.graph_send_recv(x, src_index, dst_index, - pool_type.lower(), out_size, name) + return paddle.geometric.send_u_recv(x, src_index, dst_index, + pool_type.lower(), out_size, name) class TestGraphSendRecvMaxOp(OpTest): @@ -268,20 +268,143 @@ def test_static(self): {}\n{}, check diff!".format(np_res, ret_res)) def test_dygraph(self): - device = paddle.CPUPlace() - with paddle.fluid.dygraph.guard(device): - x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), - dtype="float32") - src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") - dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), + dtype="float32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "sum") + res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "max") + res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "min") + + np_sum = np.array([[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") + np_mean = np.array([[0, 2, 3], [1, 4, 5], [1, 4, 5]], dtype="float32") + np_max = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float32") + np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") + + ret = [res_sum, res_mean, res_max, res_min] + + for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): + self.assertTrue( + np.allclose(np_res, ret_res, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_res, ret_res)) + + def test_int32_input(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), + dtype="int32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0, 1]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0, 1]), dtype="int32") + res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "sum") + res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "max") + res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "min") + + np_sum = np.array([[0, 2, 3], [3, 12, 14], [1, 4, 5]], dtype="int32") + np_mean = np.array([[0, 2, 3], [1, 4, 4], [1, 4, 5]], dtype="int32") + np_max = np.array([[0, 2, 3], [2, 6, 6], [1, 4, 5]], dtype="int32") + np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="int32") + + ret = [res_sum, res_mean, res_max, res_min] + + for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): + self.assertTrue( + np.allclose(np_res, ret_res, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_res, ret_res)) + + def test_set_outsize_gpu(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), + dtype="float32") + src_index = paddle.to_tensor(np.array([0, 0, 1]), dtype="int32") + dst_index = paddle.to_tensor(np.array([0, 1, 1]), dtype="int32") + res = paddle.incubate.graph_send_recv(x, src_index, dst_index, "sum") + out_size = 
paddle.max(dst_index) + 1 + res_set_outsize = paddle.incubate.graph_send_recv( + x, src_index, dst_index, "sum", out_size) + + np_res = np.array([[0, 2, 3], [1, 6, 8], [0, 0, 0]], dtype="float32") + np_res_set_outsize = np.array([[0, 2, 3], [1, 6, 8]], dtype="float32") + + self.assertTrue( + np.allclose(np_res, res, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_res, res)) + self.assertTrue( + np.allclose(np_res_set_outsize, res_set_outsize, atol=1e-6), + "two value is\ + {}\n{}, check diff!".format(np_res_set_outsize, + res_set_outsize)) + + def test_out_size_tensor_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[3, 3], dtype="float32") + src_index = paddle.static.data(name="src", shape=[3], dtype="int32") + dst_index = paddle.static.data(name="dst", shape=[3], dtype="int32") + out_size = paddle.static.data(name="out_size", + shape=[1], + dtype="int32") + res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index, - "sum") - res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index, - "mean") - res_max = paddle.incubate.graph_send_recv(x, src_index, dst_index, - "max") - res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index, - "min") + "sum", out_size) + + exe = paddle.static.Executor(paddle.CPUPlace()) + data1 = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]], dtype='float32') + data2 = np.array([0, 0, 1], dtype="int32") + data3 = np.array([0, 1, 1], dtype="int32") + data4 = np.array([2], dtype="int32") + + np_sum = np.array([[0, 2, 3], [1, 6, 8]], dtype="float32") + + ret = exe.run(feed={ + 'x': data1, + 'src': data2, + 'dst': data3, + 'out_size': data4, + }, + fetch_list=[res_sum]) + self.assertTrue( + np.allclose(np_sum, ret[0], atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_sum, ret[0])) + + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_dygraph() + self.test_int32_input() + self.test_set_outsize_gpu() + + +class API_GeometricSendURecvTest(unittest.TestCase): + + def test_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[3, 3], dtype="float32") + src_index = paddle.static.data(name="src", shape=[4], dtype="int32") + dst_index = paddle.static.data(name="dst", shape=[4], dtype="int32") + + res_sum = paddle.geometric.send_u_recv(x, src_index, dst_index, + "sum") + res_mean = paddle.geometric.send_u_recv(x, src_index, dst_index, + "mean") + res_max = paddle.geometric.send_u_recv(x, src_index, dst_index, + "max") + res_min = paddle.geometric.send_u_recv(x, src_index, dst_index, + "min") + + exe = paddle.static.Executor(paddle.CPUPlace()) + data1 = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype='float32') + data2 = np.array([0, 1, 2, 0], dtype="int32") + data3 = np.array([1, 2, 1, 0], dtype="int32") np_sum = np.array([[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") @@ -292,38 +415,58 @@ def test_dygraph(self): np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") - ret = [res_sum, res_mean, res_max, res_min] + ret = exe.run(feed={ + 'x': data1, + 'src': data2, + 'dst': data3 + }, + fetch_list=[res_sum, res_mean, res_max, res_min]) for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): self.assertTrue( np.allclose(np_res, ret_res, atol=1e-6), "two value is\ {}\n{}, check diff!".format(np_res, ret_res)) - def test_int32_input(self): - device = paddle.CPUPlace() - with 
paddle.fluid.dygraph.guard(device): - x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), - dtype="int32") - src_index = paddle.to_tensor(np.array([0, 1, 2, 0, 1]), - dtype="int32") - dst_index = paddle.to_tensor(np.array([1, 2, 1, 0, 1]), - dtype="int32") - res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index, - "sum") - res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index, - "mean") - res_max = paddle.incubate.graph_send_recv(x, src_index, dst_index, - "max") - res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index, - "min") + def test_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), + dtype="float32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + res_sum = paddle.geometric.send_u_recv(x, src_index, dst_index, "sum") + res_mean = paddle.geometric.send_u_recv(x, src_index, dst_index, "mean") + res_max = paddle.geometric.send_u_recv(x, src_index, dst_index, "max") + res_min = paddle.geometric.send_u_recv(x, src_index, dst_index, "min") + + np_sum = np.array([[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") + np_mean = np.array([[0, 2, 3], [1, 4, 5], [1, 4, 5]], dtype="float32") + np_max = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float32") + np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") + + ret = [res_sum, res_mean, res_max, res_min] - np_sum = np.array([[0, 2, 3], [3, 12, 14], [1, 4, 5]], - dtype="int32") - np_mean = np.array([[0, 2, 3], [1, 4, 4], [1, 4, 5]], dtype="int32") - np_max = np.array([[0, 2, 3], [2, 6, 6], [1, 4, 5]], dtype="int32") - np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="int32") + for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): + self.assertTrue( + np.allclose(np_res, ret_res, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_res, ret_res)) - ret = [res_sum, res_mean, res_max, res_min] + def test_int32_input(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), + dtype="int32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0, 1]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0, 1]), dtype="int32") + res_sum = paddle.geometric.send_u_recv(x, src_index, dst_index, "sum") + res_mean = paddle.geometric.send_u_recv(x, src_index, dst_index, "mean") + res_max = paddle.geometric.send_u_recv(x, src_index, dst_index, "max") + res_min = paddle.geometric.send_u_recv(x, src_index, dst_index, "min") + + np_sum = np.array([[0, 2, 3], [3, 12, 14], [1, 4, 5]], dtype="int32") + np_mean = np.array([[0, 2, 3], [1, 4, 4], [1, 4, 5]], dtype="int32") + np_max = np.array([[0, 2, 3], [2, 6, 6], [1, 4, 5]], dtype="int32") + np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="int32") + + ret = [res_sum, res_mean, res_max, res_min] for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): self.assertTrue( @@ -331,31 +474,60 @@ def test_int32_input(self): {}\n{}, check diff!".format(np_res, ret_res)) def test_set_outsize_gpu(self): - if paddle.fluid.core.is_compiled_with_cuda(): - x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), - dtype="float32") - src_index = paddle.to_tensor(np.array([0, 0, 1]), dtype="int32") - dst_index = paddle.to_tensor(np.array([0, 1, 1]), dtype="int32") - res = paddle.incubate.graph_send_recv(x, src_index, dst_index, - "sum") - out_size = paddle.max(dst_index) + 1 - res_set_outsize = 
paddle.incubate.graph_send_recv( - x, src_index, dst_index, "sum", out_size) - - np_res = np.array([[0, 2, 3], [1, 6, 8], [0, 0, 0]], - dtype="float32") - np_res_set_outsize = np.array([[0, 2, 3], [1, 6, 8]], - dtype="float32") - - self.assertTrue( - np.allclose(np_res, res, atol=1e-6), "two value is\ + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), + dtype="float32") + src_index = paddle.to_tensor(np.array([0, 0, 1]), dtype="int32") + dst_index = paddle.to_tensor(np.array([0, 1, 1]), dtype="int32") + res = paddle.geometric.send_u_recv(x, src_index, dst_index, "sum") + out_size = paddle.max(dst_index) + 1 + res_set_outsize = paddle.geometric.send_u_recv(x, src_index, dst_index, + "sum", out_size) + + np_res = np.array([[0, 2, 3], [1, 6, 8], [0, 0, 0]], dtype="float32") + np_res_set_outsize = np.array([[0, 2, 3], [1, 6, 8]], dtype="float32") + + self.assertTrue( + np.allclose(np_res, res, atol=1e-6), "two value is\ {}\n{}, check diff!".format(np_res, res)) - self.assertTrue( - np.allclose(np_res_set_outsize, res_set_outsize, atol=1e-6), - "two value is\ + self.assertTrue( + np.allclose(np_res_set_outsize, res_set_outsize, atol=1e-6), + "two value is\ {}\n{}, check diff!".format(np_res_set_outsize, res_set_outsize)) + def test_out_size_tensor_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[3, 3], dtype="float32") + src_index = paddle.static.data(name="src", shape=[3], dtype="int32") + dst_index = paddle.static.data(name="dst", shape=[3], dtype="int32") + out_size = paddle.static.data(name="out_size", + shape=[1], + dtype="int32") + + res_sum = paddle.geometric.send_u_recv(x, src_index, dst_index, + "sum", out_size) + + exe = paddle.static.Executor(paddle.CPUPlace()) + data1 = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]], dtype='float32') + data2 = np.array([0, 0, 1], dtype="int32") + data3 = np.array([0, 1, 1], dtype="int32") + data4 = np.array([2], dtype="int32") + + np_sum = np.array([[0, 2, 3], [1, 6, 8]], dtype="float32") + + ret = exe.run(feed={ + 'x': data1, + 'src': data2, + 'dst': data3, + 'out_size': data4, + }, + fetch_list=[res_sum]) + self.assertTrue( + np.allclose(np_sum, ret[0], atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_sum, ret[0])) + def test_api_eager_dygraph(self): with _test_eager_guard(): self.test_dygraph() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index d59cdc3e328e2..79194928f9d81 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -324,8 +324,8 @@ def nan_inf(self): for param in model.parameters(): # param not update when tensor contains nan or inf - self.assertTrue( - np.array_equal(param.numpy(), params_init[param.name])) + np.testing.assert_array_equal(param.numpy(), + params_init[param.name]) def test_nan_inf(self): self.nan_inf() @@ -974,7 +974,7 @@ def train(layer, loader, loss_fn, opt): fetch_list=fetch_targets) print("pred.numpy()", pred.numpy()) print("result", results[0]) - self.assertTrue(np.array_equal(pred.numpy(), results[0])) + np.testing.assert_array_equal(pred.numpy(), results[0]) paddle.disable_static() def test_inference_save_load(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py 
b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py index 3b1a0436556b1..6a256ec108832 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py @@ -323,8 +323,8 @@ def nan_inf(self): for param in model.parameters(): # param not update when tensor contains nan or inf - self.assertTrue( - np.array_equal(param.numpy(), params_init[param.name])) + np.testing.assert_array_equal(param.numpy(), + params_init[param.name]) def test_nan_inf(self): self.nan_inf() @@ -965,7 +965,7 @@ def train(layer, loader, loss_fn, opt): fetch_list=fetch_targets) print("pred.numpy()", pred.numpy()) print("result", results[0]) - self.assertTrue(np.array_equal(pred.numpy(), results[0])) + np.testing.assert_array_equal(pred.numpy(), results[0]) paddle.disable_static() def test_inference_save_load(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index 7a5934b4fdc79..0cd97cbf32887 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -314,8 +314,8 @@ def func_auto_prune8(self): learning_rate=0.003, parameter_list=(linear.parameters() + linear2.parameters())) optimizer.minimize(out2) - self.assertTrue( - np.array_equal(linear2_origin, linear2.weight.numpy())) + np.testing.assert_array_equal(linear2_origin, + linear2.weight.numpy()) self.assertFalse( np.array_equal(linear_origin, linear.weight.numpy())) @@ -344,10 +344,9 @@ def func_auto_prune9(self): learning_rate=0.003, parameter_list=(linear.parameters() + linear2.parameters())) optimizer.minimize(out2) - self.assertTrue( - np.array_equal(linear2_origin, linear2.weight.numpy())) - self.assertTrue(np.array_equal(linear_origin, - linear.weight.numpy())) + np.testing.assert_array_equal(linear2_origin, + linear2.weight.numpy()) + np.testing.assert_array_equal(linear_origin, linear.weight.numpy()) try: linear2.weight.gradient() except ValueError as e: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index e67bae46a53a7..e1663ad400f46 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -159,8 +159,8 @@ def functional_dygraph_context(self): out.backward() dy_grad2 = mlp._linear1.weight.gradient() self.assertFalse(fluid.dygraph.enabled()) - self.assertTrue(np.array_equal(dy_out1, dy_out2)) - self.assertTrue(np.array_equal(dy_grad1, dy_grad2)) + np.testing.assert_array_equal(dy_out1, dy_out2) + np.testing.assert_array_equal(dy_grad1, dy_grad2) def test_functional_dygraph_context(self): with _test_eager_guard(): @@ -190,8 +190,8 @@ def functional_paddle_imperative_dygraph_context(self): dy_grad2 = mlp._linear1.weight.gradient() paddle.enable_static() self.assertFalse(paddle.in_dynamic_mode()) - self.assertTrue(np.array_equal(dy_out1, dy_out2)) - self.assertTrue(np.array_equal(dy_grad1, dy_grad2)) + np.testing.assert_array_equal(dy_out1, dy_out2) + np.testing.assert_array_equal(dy_grad1, dy_grad2) def test_functional_paddle_imperative_dygraph_context(self): with _test_eager_guard(): @@ -229,12 +229,12 @@ def func_create_varbase(self): egr_tmp5 = fluid.core.eager.Tensor(value=x) egr_tmp6 = fluid.core.eager.Tensor(t) - 
self.assertTrue(np.array_equal(x, egr_tmp.numpy())) - self.assertTrue(np.array_equal(y, egr_tmp2.numpy())) - self.assertTrue(np.array_equal(x, egr_tmp3.numpy())) - self.assertTrue(np.array_equal(y, egr_tmp4.numpy())) - self.assertTrue(np.array_equal(x, egr_tmp5.numpy())) - self.assertTrue(np.array_equal(x, egr_tmp6.numpy())) + np.testing.assert_array_equal(x, egr_tmp.numpy()) + np.testing.assert_array_equal(y, egr_tmp2.numpy()) + np.testing.assert_array_equal(x, egr_tmp3.numpy()) + np.testing.assert_array_equal(y, egr_tmp4.numpy()) + np.testing.assert_array_equal(x, egr_tmp5.numpy()) + np.testing.assert_array_equal(x, egr_tmp6.numpy()) else: tmp = fluid.core.VarBase(value=x, place=fluid.core.CPUPlace()) tmp2 = fluid.core.VarBase(y, fluid.core.CPUPlace()) @@ -243,12 +243,12 @@ def func_create_varbase(self): tmp5 = fluid.core.VarBase(value=x) tmp6 = fluid.core.VarBase(t) - self.assertTrue(np.array_equal(x, tmp.numpy())) - self.assertTrue(np.array_equal(y, tmp2.numpy())) - self.assertTrue(np.array_equal(x, tmp3.numpy())) - self.assertTrue(np.array_equal(y, tmp4.numpy())) - self.assertTrue(np.array_equal(x, tmp5.numpy())) - self.assertTrue(np.array_equal(x, tmp6.numpy())) + np.testing.assert_array_equal(x, tmp.numpy()) + np.testing.assert_array_equal(y, tmp2.numpy()) + np.testing.assert_array_equal(x, tmp3.numpy()) + np.testing.assert_array_equal(y, tmp4.numpy()) + np.testing.assert_array_equal(x, tmp5.numpy()) + np.testing.assert_array_equal(x, tmp6.numpy()) def test_create_varbase(self): with fluid.dygraph.guard(): @@ -479,10 +479,10 @@ def func_layer_in_out(self): feed={inp.name: np_inp}, fetch_list=[x.name, param_grads[1].name]) - self.assertTrue(np.array_equal(dy_out, static_out)) - self.assertTrue(np.array_equal(dy_grad, static_grad)) - self.assertTrue(np.array_equal(dy_out2, static_out)) - self.assertTrue(np.array_equal(dy_grad2, static_grad)) + np.testing.assert_array_equal(dy_out, static_out) + np.testing.assert_array_equal(dy_grad, static_grad) + np.testing.assert_array_equal(dy_out2, static_out) + np.testing.assert_array_equal(dy_grad2, static_grad) def test_layer_in_out(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) @@ -577,9 +577,9 @@ def fun(x, y, z): loss2 = x * z loss1.backward(retain_graph=True) loss2.backward(retain_graph=True) - self.assertTrue(np.array_equal(x.grad.numpy(), [23.])) - self.assertTrue(np.array_equal(y.grad.numpy(), [25.])) - self.assertTrue(np.array_equal(z.grad.numpy(), [5.])) + np.testing.assert_array_equal(x.grad.numpy(), [23.0]) + np.testing.assert_array_equal(y.grad.numpy(), [25.0]) + np.testing.assert_array_equal(z.grad.numpy(), [5.0]) x.clear_grad() y.clear_grad() z.clear_grad() @@ -592,13 +592,13 @@ def fun(x, y, z): loss = fun(x, y, z) loss.backward(retain_graph=True) # x.grad = 2*x*y + z + 2*y = 27 - self.assertTrue(np.array_equal(x.grad.numpy(), [27])) + np.testing.assert_array_equal(x.grad.numpy(), [27]) loss.backward(retain_graph=True) - self.assertTrue(np.array_equal(x.grad.numpy(), [54])) + np.testing.assert_array_equal(x.grad.numpy(), [54]) loss.backward() - self.assertTrue(np.array_equal(x.grad.numpy(), [81])) + np.testing.assert_array_equal(x.grad.numpy(), [81]) with self.assertRaises(RuntimeError): loss.backward() @@ -608,8 +608,8 @@ def fun(x, y, z): dx = paddle.grad([loss1], x, create_graph=True)[0] loss = loss1 + loss2 + dx loss.backward() - self.assertTrue(np.array_equal(dx.grad.numpy(), [1])) - self.assertTrue(np.array_equal(x.grad.numpy(), [108])) + np.testing.assert_array_equal(dx.grad.numpy(), [1]) + 
np.testing.assert_array_equal(x.grad.numpy(), [108]) def test_mlp(sort_sum_gradient): fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient}) @@ -641,7 +641,7 @@ def test_mlp(sort_sum_gradient): loss = mlp1(x) loss.backward() - self.assertTrue(np.array_equal(loss.grad.numpy(), [1])) + np.testing.assert_array_equal(loss.grad.numpy(), [1]) self.assertTrue( np.allclose(mlp1._linear1.weight.grad.numpy(), expected_weight1_grad)) @@ -656,7 +656,7 @@ def test_mlp(sort_sum_gradient): expected_bias2_grad)) mlp2.clear_gradients() - self.assertTrue(np.array_equal(clear_loss.grad.numpy(), [1])) + np.testing.assert_array_equal(clear_loss.grad.numpy(), [1]) if ((batch_id + 1) % 10) % 2 == 0: mlp1.clear_gradients() expected_weight1_grad = 0. @@ -785,14 +785,14 @@ def func_rnn(self): param_grads[1][1].name, param_grads[2][1].name ]) - self.assertTrue(np.array_equal(dy_out, static_out)) - self.assertTrue(np.array_equal(dy_grad_h2o, static_grad_h2o)) - self.assertTrue(np.array_equal(dy_grad_h2h, static_grad_h2h)) - self.assertTrue(np.array_equal(dy_grad_i2h, static_grad_i2h)) - self.assertTrue(np.array_equal(dy_out2, static_out)) - self.assertTrue(np.array_equal(dy_grad_h2o2, static_grad_h2o)) - self.assertTrue(np.array_equal(dy_grad_h2h2, static_grad_h2h)) - self.assertTrue(np.array_equal(dy_grad_i2h2, static_grad_i2h)) + np.testing.assert_array_equal(dy_out, static_out) + np.testing.assert_array_equal(dy_grad_h2o, static_grad_h2o) + np.testing.assert_array_equal(dy_grad_h2h, static_grad_h2h) + np.testing.assert_array_equal(dy_grad_i2h, static_grad_i2h) + np.testing.assert_array_equal(dy_out2, static_out) + np.testing.assert_array_equal(dy_grad_h2o2, static_grad_h2o) + np.testing.assert_array_equal(dy_grad_h2h2, static_grad_h2h) + np.testing.assert_array_equal(dy_grad_i2h2, static_grad_i2h) def test_rnn(self): with _test_eager_guard(): @@ -846,7 +846,7 @@ def func_append_activation_in_dygraph1(self): a = paddle.to_tensor(a_np) res1 = func(a, act="hard_sigmoid") res2 = fluid.layers.hard_sigmoid(a) - self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_append_activation_in_dygraph1(self): with _test_eager_guard(): @@ -875,7 +875,7 @@ def func_append_activation_in_dygraph3(self): a = paddle.to_tensor(a_np) res1 = func(a, act="sigmoid", use_cudnn=True) res2 = fluid.layers.sigmoid(a) - self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_append_activation_in_dygraph3(self): with _test_eager_guard(): @@ -892,7 +892,7 @@ def func_append_activation_in_dygraph_use_mkldnn(self): a = paddle.to_tensor(a_np) res1 = func(a) res2 = fluid.layers.relu(a) - self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_append_activation_in_dygraph_use_mkldnn(self): with _test_eager_guard(): @@ -911,7 +911,7 @@ def func_append_activation_in_dygraph_global_use_mkldnn(self): finally: fluid.set_flags({'FLAGS_use_mkldnn': False}) res2 = fluid.layers.relu(a) - self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_append_activation_in_dygraph_global_use_mkldnn(self): with _test_eager_guard(): @@ -937,7 +937,7 @@ def func_append_bias_in_dygraph(self): a = paddle.to_tensor(a_np) res1 = func(a, bias=a) res2 = paddle.add(a, a) - self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + np.testing.assert_array_equal(res1.numpy(), 
res2.numpy()) def test_append_bias_in_dygraph(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py index 8e9c3c280f466..3e667563e3492 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_data_parallel.py @@ -58,8 +58,8 @@ def test_data_parallel_state_dict(self): for k, v in single_state.items(): self.assertTrue(k in parallel_state) - self.assertTrue( - np.array_equal(v.numpy(), parallel_state[k].numpy())) + np.testing.assert_array_equal(v.numpy(), + parallel_state[k].numpy()) base_para[k] = v.numpy() @@ -75,7 +75,7 @@ def test_data_parallel_state_dict(self): parallel_state = parallel_mlp.state_dict() for k, v in parallel_state.items(): - self.assertTrue(np.array_equal(v.numpy(), base_para[k])) + np.testing.assert_array_equal(v.numpy(), base_para[k]) parallel_mlp.load_dict(base_para) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index d80b708ebf25c..5527ab2769165 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -186,9 +186,8 @@ def record_hook(grad): out4 = paddle.mean(out3) egr_dout2, egr_dout3 = paddle.grad([out4], [out2, out3]) - self.assertTrue( - np.array_equal(dout2_record_by_hook[0].numpy(), - np.array([1., 2.]))) + np.testing.assert_array_equal(dout2_record_by_hook[0].numpy(), + np.array([1.0, 2.0])) x1 = paddle.to_tensor([1.0, 2.0]) x1.stop_gradient = False @@ -203,8 +202,8 @@ def record_hook(grad): self.assertEqual(dout2.stop_gradient, egr_dout2.stop_gradient) self.assertEqual(dout3.stop_gradient, egr_dout3.stop_gradient) - self.assertTrue(np.array_equal(dout2.numpy(), egr_dout2.numpy())) - self.assertTrue(np.array_equal(dout3.numpy(), egr_dout3.numpy())) + np.testing.assert_array_equal(dout2.numpy(), egr_dout2.numpy()) + np.testing.assert_array_equal(dout3.numpy(), egr_dout3.numpy()) class TestDygraphDoubleGrad(TestCase): @@ -392,15 +391,13 @@ def func_none_one_initial_gradient(self): if grad_y is not None: self.assertTrue(grad_y.stop_gradient) - self.assertTrue( - np.array_equal(grad_y.numpy(), - original_random_grad_y)) + np.testing.assert_array_equal(grad_y.numpy(), + original_random_grad_y) if grad_z is not None: self.assertTrue(grad_z.stop_gradient) - self.assertTrue( - np.array_equal(grad_z.numpy(), - original_random_grad_z)) + np.testing.assert_array_equal(grad_z.numpy(), + original_random_grad_z) def test_none_one_initial_gradient(self): with _test_eager_guard(): @@ -583,7 +580,7 @@ def model_f(input): grad_2 = a.gradient() - self.assertTrue(np.array_equal(grad_1, grad_2)) + np.testing.assert_array_equal(grad_1, grad_2) def test_compare(self): with _test_eager_guard(): @@ -647,8 +644,8 @@ def test_resnet_resnet50(self): g_numpy = g.numpy() self.assertEqual(list(g_numpy.shape), list(out.shape)) - self.assertTrue(np.array_equal(egr_out, out)) - self.assertTrue(np.array_equal(egr_g_numpy, g_numpy)) + np.testing.assert_array_equal(egr_out, out) + np.testing.assert_array_equal(egr_g_numpy, g_numpy) @dygraph_guard def test_resnet_resnet101(self): @@ -679,8 +676,8 @@ def test_resnet_resnet101(self): g_numpy = g.numpy() self.assertEqual(list(g_numpy.shape), list(out.shape)) - self.assertTrue(np.array_equal(egr_out, out)) - 
self.assertTrue(np.array_equal(egr_g_numpy, g_numpy)) + np.testing.assert_array_equal(egr_out, out) + np.testing.assert_array_equal(egr_g_numpy, g_numpy) class TestDoubleGradBasics(TestCase): @@ -705,22 +702,22 @@ def test_matmul(self): new_x_g.backward() out_ref = np.ones([3, 3]) * 12.0 - self.assertTrue(np.array_equal(out.numpy(), out_ref)) + np.testing.assert_array_equal(out.numpy(), out_ref) new_x_g_ref = np.ones([3, 3]) * 6.0 new_y_g_ref = np.ones([3, 3]) * 6.0 - self.assertTrue(np.array_equal(new_x_g.numpy(), new_x_g_ref)) - self.assertTrue(np.array_equal(new_y_g.numpy(), new_y_g_ref)) + np.testing.assert_array_equal(new_x_g.numpy(), new_x_g_ref) + np.testing.assert_array_equal(new_y_g.numpy(), new_y_g_ref) x_grad_ref = np.ones([3, 3]) * 0.0 - self.assertTrue(np.array_equal(x.grad.numpy(), x_grad_ref)) + np.testing.assert_array_equal(x.grad.numpy(), x_grad_ref) y_grad_ref = np.ones([3, 3]) * 3.0 - self.assertTrue(np.array_equal(y.grad.numpy(), y_grad_ref)) + np.testing.assert_array_equal(y.grad.numpy(), y_grad_ref) grad_out_grad_ref = np.ones([3, 3]) * 6.0 - self.assertTrue( - np.array_equal(grad_out.grad.numpy(), grad_out_grad_ref)) + np.testing.assert_array_equal(grad_out.grad.numpy(), + grad_out_grad_ref) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py index 87d0d8e81b03c..a7e4af4165ba2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py @@ -95,29 +95,27 @@ def func_forward_hook_return_value(self): forward_pre_hook_handle1 = simplenet.register_forward_pre_hook( forward_pre_hook1) outs_pre_hook = simplenet(input, y) - self.assertTrue( - np.array_equal(outs_pre_hook.numpy(), outs_origin1.numpy())) + np.testing.assert_array_equal(outs_pre_hook.numpy(), + outs_origin1.numpy()) # remove forward_pre_hook forward_pre_hook_handle1.remove() outs_pre_hook = simplenet(input, y) - self.assertTrue( - np.array_equal(outs_pre_hook.numpy(), outs_origin.numpy())) + np.testing.assert_array_equal(outs_pre_hook.numpy(), + outs_origin.numpy()) # register forward_posst_hook forward_post_hook_handle1 = simplenet.register_forward_post_hook( forward_post_hook1) outs_forward_hook = simplenet(input, y) - self.assertTrue( - np.array_equal(outs_forward_hook.numpy(), - outs_origin.numpy() * 2)) + np.testing.assert_array_equal(outs_forward_hook.numpy(), + outs_origin.numpy() * 2) # remove forward_post_hook forward_post_hook_handle1.remove() outs_forward_hook = simplenet(input, y) - self.assertTrue( - np.array_equal(outs_forward_hook.numpy(), - outs_origin.numpy())) + np.testing.assert_array_equal(outs_forward_hook.numpy(), + outs_origin.numpy()) # test forward_pre_hook and forward_post_hook that don't have return value def func_forward_hook(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index 36bec7fb0301f..a1c9708923435 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -218,7 +218,7 @@ def __init__(self): my_test = MyTest() my_test.set_dict(new_dict, use_structured_name=False) for k, v in my_test.state_dict().items(): - self.assertTrue(np.array_equal(v.numpy(), new_dict[v.name])) + np.testing.assert_array_equal(v.numpy(), new_dict[v.name]) 
temp_dir.cleanup() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index f9306d0cfebd7..8015fceff5d9c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -200,10 +200,9 @@ def simple_net_float32(self, is_sparse, dtype): self.assertTrue( np.allclose(static_loss_value, dy_loss_value, rtol=1e-3)) for key, value in six.iteritems(static_param_init): - self.assertTrue(np.array_equal(value, dy_param_init[key])) + np.testing.assert_array_equal(value, dy_param_init[key]) for key, value in six.iteritems(static_param_updated): - self.assertTrue(np.array_equal(value, - dy_param_updated[key])) + np.testing.assert_array_equal(value, dy_param_updated[key]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py index c0287668a3195..a93471a09c9c3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py @@ -13,10 +13,11 @@ # limitations under the License. import unittest +import warnings + import numpy as np import paddle.fluid as fluid -import warnings -from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph +from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard class TestImperativeNumpyBridge(unittest.TestCase): @@ -31,14 +32,14 @@ def func_tensor_from_numpy(self): w[-1].message) # Temporally diable zero_copy # var = fluid.dygraph.to_variable(data_np, zero_copy=True) - # self.assertTrue(np.array_equal(var.numpy(), data_np)) + # np.testing.assert_array_equal(var.numpy(), data_np) # data_np[0][0] = 4 # self.assertEqual(data_np[0][0], 4) # self.assertEqual(var[0][0].numpy()[0], 4) - # self.assertTrue(np.array_equal(var.numpy(), data_np)) + # np.testing.assert_array_equal(var.numpy(), data_np) var2 = fluid.dygraph.to_variable(data_np, zero_copy=False) - self.assertTrue(np.array_equal(var2.numpy(), data_np)) + np.testing.assert_array_equal(var2.numpy(), data_np) data_np[0][0] = -1 self.assertEqual(data_np[0][0], -1) if not _in_legacy_dygraph(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 064f0948cade5..21327255fb656 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -573,7 +573,7 @@ def run_dygraph(): self.assertTrue(np.allclose(static_out, dy_out)) for key, value in six.iteritems(static_param_init_value): - self.assertTrue(np.array_equal(value, dy_param_init_value[key])) + np.testing.assert_array_equal(value, dy_param_init_value[key]) for key, value in six.iteritems(static_param_value): self.assertTrue(np.allclose(value, dy_param_value[key], rtol=1e-05)) @@ -582,7 +582,7 @@ def run_dygraph(): self.assertTrue(np.allclose(static_out, eager_out)) for key, value in six.iteritems(static_param_init_value): - self.assertTrue(np.array_equal(value, eager_param_init_value[key])) + np.testing.assert_array_equal(value, eager_param_init_value[key]) for key, value in six.iteritems(static_param_value): self.assertTrue( diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index e5e26111381a7..34f77b199f212 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -376,15 +376,15 @@ def ptb_rnn_cpu_float32(self, is_sparse): static_param_updated[static_param_name_list[k - 3]] = out[k] - self.assertTrue(np.array_equal(static_loss_value, dy_loss_value)) - self.assertTrue( - np.array_equal(static_last_cell_value, dy_last_cell_value)) - self.assertTrue( - np.array_equal(static_last_hidden_value, dy_last_hidden_value)) + np.testing.assert_array_equal(static_loss_value, dy_loss_value) + np.testing.assert_array_equal(static_last_cell_value, + dy_last_cell_value) + np.testing.assert_array_equal(static_last_hidden_value, + dy_last_hidden_value) for key, value in six.iteritems(static_param_init): - self.assertTrue(np.array_equal(value, dy_param_init[key])) + np.testing.assert_array_equal(value, dy_param_init[key]) for key, value in six.iteritems(static_param_updated): - self.assertTrue(np.array_equal(value, dy_param_updated[key])) + np.testing.assert_array_equal(value, dy_param_updated[key]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py index 06bca877c8775..d0b12f33051ce 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py @@ -164,15 +164,15 @@ def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse): static_param_updated[static_param_name_list[k - 3]] = out[k] - self.assertTrue(np.array_equal(static_loss_value, dy_loss_value)) - self.assertTrue( - np.array_equal(static_last_cell_value, dy_last_cell_value)) - self.assertTrue( - np.array_equal(static_last_hidden_value, dy_last_hidden_value)) + np.testing.assert_array_equal(static_loss_value, dy_loss_value) + np.testing.assert_array_equal(static_last_cell_value, + dy_last_cell_value) + np.testing.assert_array_equal(static_last_hidden_value, + dy_last_hidden_value) for key, value in six.iteritems(static_param_init): - self.assertTrue(np.array_equal(value, dy_param_init[key])) + np.testing.assert_array_equal(value, dy_param_init[key]) for key, value in six.iteritems(static_param_updated): - self.assertTrue(np.array_equal(value, dy_param_updated[key])) + np.testing.assert_array_equal(value, dy_param_updated[key]) def test_ptb_rnn_sort_gradient(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py index f59256f25f8ff..96a8e77f1a7c9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py @@ -117,10 +117,10 @@ def test_recurrent_feed(self): static_dout = out[2] original_np1 = static_out_value - self.assertTrue(np.array_equal(static_sum_out, sum_out_value)) - self.assertTrue(np.array_equal(static_sum_out, eager_sum_out_value)) - self.assertTrue(np.array_equal(static_dout, dyout)) - self.assertTrue(np.array_equal(static_dout, eager_dyout)) + np.testing.assert_array_equal(static_sum_out, sum_out_value) + np.testing.assert_array_equal(static_sum_out, eager_sum_out_value) + 
np.testing.assert_array_equal(static_dout, dyout) + np.testing.assert_array_equal(static_dout, eager_dyout) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 593c046212276..7423b6ecfc4f0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -387,8 +387,8 @@ def func_testLoadAndSetVarBase(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if isinstance(v, (core.VarBase, core.eager.Tensor)): - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name])) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name]) else: self.assertEqual(v, self.base_opti[k]) @@ -409,7 +409,7 @@ def func_testLoadAndSetVarBase(self): base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testSetVariable(self): seed = 90 @@ -492,8 +492,8 @@ def func_testSetVariable(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if isinstance(v, (core.VarBase, core.eager.Tensor)): - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name])) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name]) else: self.assertEqual(v, self.base_opti[k]) @@ -514,7 +514,7 @@ def func_testSetVariable(self): base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testSetNumpy(self): seed = 90 @@ -601,8 +601,8 @@ def func_testSetNumpy(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if isinstance(v, (core.VarBase, core.eager.Tensor)): - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name])) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name]) else: self.assertEqual(v, self.base_opti[k]) @@ -625,7 +625,7 @@ def func_testSetNumpy(self): base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testSetVariableBeforeTrain(self): seed = 90 @@ -682,17 +682,15 @@ def func_testSetVariableBeforeTrain(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if k == "global_step": - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name] + 1) if k.find("beta1_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta1)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta1) if k.find("beta2_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta2)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta2) state_dict = ptb_model.state_dict() @@ -700,7 +698,7 @@ def func_testSetVariableBeforeTrain(self): new_t = v.numpy() base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testLoadAndSetVarBaseBeforeTrain(self): seed = 90 @@ -769,17 +767,15 @@ def func_testLoadAndSetVarBaseBeforeTrain(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if k == "global_step": - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name] + 1) if k.find("beta1_pow_acc_0") > 0: - self.assertTrue( 
- np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta1)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta1) if k.find("beta2_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta2)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta2) # check parameter @@ -789,7 +785,7 @@ def func_testLoadAndSetVarBaseBeforeTrain(self): new_t = v.numpy() base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testSetNumpyBeforeTrain(self): seed = 90 @@ -870,17 +866,15 @@ def func_testSetNumpyBeforeTrain(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if k == "global_step": - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name] + 1) if k.find("beta1_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta1)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta1) if k.find("beta2_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta2)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta2) # check parameter @@ -890,7 +884,7 @@ def func_testSetNumpyBeforeTrain(self): new_t = v.numpy() base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testOnlyLoadParams(self): with fluid.dygraph.guard(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index f0026f8ef3307..5c67f5085d2e2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -401,8 +401,8 @@ def func_testLoadAndSetVarBase(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if isinstance(v, (core.VarBase, core.eager.Tensor)): - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name])) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name]) else: self.assertEqual(v, self.base_opti[k]) @@ -423,7 +423,7 @@ def func_testLoadAndSetVarBase(self): base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testSetVariable(self): seed = 90 @@ -508,8 +508,8 @@ def func_testSetVariable(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if isinstance(v, (core.VarBase, core.eager.Tensor)): - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name])) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name]) else: self.assertEqual(v, self.base_opti[k]) @@ -530,7 +530,7 @@ def func_testSetVariable(self): base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testSetNumpy(self): seed = 90 @@ -619,8 +619,8 @@ def func_testSetNumpy(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if isinstance(v, (core.VarBase, core.eager.Tensor)): - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name])) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name]) else: self.assertEqual(v, self.base_opti[k]) @@ -643,7 +643,7 @@ def func_testSetNumpy(self): base_t = self.model_base[k] - 
self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testSetVariableBeforeTrain(self): seed = 90 @@ -702,17 +702,15 @@ def func_testSetVariableBeforeTrain(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if k == "global_step": - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name] + 1) if k.find("beta1_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta1)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta1) if k.find("beta2_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta2)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta2) state_dict = ptb_model.state_dict() @@ -720,7 +718,7 @@ def func_testSetVariableBeforeTrain(self): new_t = v.numpy() base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testLoadAndSetVarBaseBeforeTrain(self): seed = 90 @@ -790,17 +788,15 @@ def func_testLoadAndSetVarBaseBeforeTrain(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if k == "global_step": - self.assertTrue( - np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + np.testing.assert_array_equal(v.numpy(), + self.base_opti[v.name] + 1) if k.find("beta1_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta1)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta1) if k.find("beta2_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta2)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta2) # check parameter @@ -810,7 +806,7 @@ def func_testLoadAndSetVarBaseBeforeTrain(self): new_t = v.numpy() base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testSetNumpyBeforeTrain(self): seed = 90 @@ -892,18 +888,15 @@ def func_testSetNumpyBeforeTrain(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): if k == "LR_Scheduler": - self.assertTrue( - np.array_equal(v['last_epoch'], - self.base_opti[k]['last_epoch'] + 1)) + np.testing.assert_array_equal( + v['last_epoch'], self.base_opti[k]['last_epoch'] + 1) if k.find("beta1_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta1)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta1) if k.find("beta2_pow_acc_0") > 0: - self.assertTrue( - np.array_equal(v.numpy(), - self.base_opti[v.name] * adam._beta2)) + np.testing.assert_array_equal( + v.numpy(), self.base_opti[v.name] * adam._beta2) # check parameter @@ -913,7 +906,7 @@ def func_testSetNumpyBeforeTrain(self): new_t = v.numpy() base_t = self.model_base[k] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def func_testOnlyLoadParams(self): with fluid.dygraph.guard(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index 9f01315720500..8268e52127ecd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -203,13 +203,11 @@ def simple_net_float(self, is_sparse, dtype): static_param_updated[static_param_name_list[ k - 1]] = out[k] - self.assertTrue(np.array_equal(static_loss_value, - dy_loss_value)) + np.testing.assert_array_equal(static_loss_value, dy_loss_value) for key, value in six.iteritems(static_param_init): - self.assertTrue(np.array_equal(value, dy_param_init[key])) + np.testing.assert_array_equal(value, dy_param_init[key]) for key, value in six.iteritems(static_param_updated): - self.assertTrue(np.array_equal(value, - dy_param_updated[key])) + np.testing.assert_array_equal(value, dy_param_updated[key]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py index d031cd84683da..027bb2b9173f3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py @@ -313,11 +313,11 @@ def test_mnist_train_no_params_filename(self): self.load_and_train_static() # Phase 3. compare - self.assertTrue(np.array_equal(static_x_data, dy_x_data)) + np.testing.assert_array_equal(static_x_data, dy_x_data) for key, value in six.iteritems(static_param_init_value): key = dict_old_new_init[key] - self.assertTrue(np.array_equal(value, dy_param_init_value[key])) + np.testing.assert_array_equal(value, dy_param_init_value[key]) # np.testing.assert_array_almost_equal(static_out, dy_out) self.assertTrue(np.allclose(static_out, dy_out, atol=1e-04)) @@ -341,10 +341,10 @@ def test_mnist_train_with_params_filename(self): self.load_and_train_static() # Phase 3. compare - self.assertTrue(np.array_equal(static_x_data, dy_x_data)) + np.testing.assert_array_equal(static_x_data, dy_x_data) for key, value in six.iteritems(static_param_init_value): key = dict_old_new_init[key] - self.assertTrue(np.array_equal(value, dy_param_init_value[key])) + np.testing.assert_array_equal(value, dy_param_init_value[key]) # np.testing.assert_array_almost_equal(static_out, dy_out) self.assertTrue(np.allclose(static_out, dy_out, atol=1e-04)) @@ -368,7 +368,7 @@ def test_mnist_infer_no_params_filename(self): self.load_and_infer_static() # Phase 3. 
compare - self.assertTrue(np.array_equal(static_x_data, dy_x_data)) + np.testing.assert_array_equal(static_x_data, dy_x_data) np.testing.assert_array_almost_equal(static_out, dy_out) self.assertTrue(np.allclose(static_out, dy_out, atol=1e-04)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py index 0c4dad64adaea..1f7ac043d056d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py @@ -232,7 +232,7 @@ def test_while_no_params_filename(self): static_param_init_value.keys()) for key, value in six.iteritems(static_param_init_value): key = dict_old_new_init[key] - self.assertTrue(np.array_equal(value, dy_param_init_value[key])) + np.testing.assert_array_equal(value, dy_param_init_value[key]) self.assertTrue(np.allclose(static_out, dy_out)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py b/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py index 8a7fa967897eb..db655e4b4e6d1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py @@ -62,7 +62,7 @@ def test_main(self): dygraph_out = layer(in_x) dygraph_out_numpy = dygraph_out.numpy() static_out = traced_layer([in_x])[0] - self.assertTrue(np.array_equal(dygraph_out_numpy, static_out)) + np.testing.assert_array_equal(dygraph_out_numpy, static_out) loss = fluid.layers.reduce_mean(dygraph_out) loss.backward() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 7f60d6c64acb7..732de03ff0e6b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -1133,19 +1133,19 @@ def run_dygraph(): static_param_updated[static_param_name_list[k - 4]] = out[k] if _in_legacy_dygraph(): - self.assertTrue( - np.array_equal(static_avg_cost_value, dy_avg_cost_value)) - self.assertTrue( - np.array_equal(static_sum_cost_value, dy_sum_cost_value)) - self.assertTrue( - np.array_equal(static_predict_value, dy_predict_value)) - self.assertTrue( - np.array_equal(static_token_num_value, dy_token_num_value)) + np.testing.assert_array_equal(static_avg_cost_value, + dy_avg_cost_value) + np.testing.assert_array_equal(static_sum_cost_value, + dy_sum_cost_value) + np.testing.assert_array_equal(static_predict_value, + dy_predict_value) + np.testing.assert_array_equal(static_token_num_value, + dy_token_num_value) for key, value in six.iteritems(static_param_init): - self.assertTrue(np.array_equal(value, dy_param_init[key])) + np.testing.assert_array_equal(value, dy_param_init[key]) for key, value in six.iteritems(static_param_updated): - self.assertTrue(np.array_equal(value, dy_param_updated[key])) + np.testing.assert_array_equal(value, dy_param_updated[key]) # compare eager result with imperative result with guard(): @@ -1164,7 +1164,7 @@ def run_dygraph(): self.assertTrue(np.allclose(dy_token_num_value, eager_token_num_value)) for key, value in six.iteritems(static_param_init): - self.assertTrue(np.array_equal(value, eager_param_init[key])) + np.testing.assert_array_equal(value, 
eager_param_init[key]) for key, value in six.iteritems(dy_param_updated): self.assertTrue(np.allclose(value, eager_param_updated[key])) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py index b814ca87dcd76..d3f2009e69d75 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py @@ -78,37 +78,34 @@ def test_matmul_triple_grad(self): new_a.backward() out_ref = np.ones([3, 3]) * 12.0 - self.assertTrue(np.array_equal(out.numpy(), out_ref)) + np.testing.assert_array_equal(out.numpy(), out_ref) new_x_g_ref = np.ones([3, 3]) * 6.0 new_y_g_ref = np.ones([3, 3]) * 6.0 - self.assertTrue(np.array_equal(new_x_g.numpy(), new_x_g_ref)) - self.assertTrue(np.array_equal(new_y_g.numpy(), new_y_g_ref)) + np.testing.assert_array_equal(new_x_g.numpy(), new_x_g_ref) + np.testing.assert_array_equal(new_y_g.numpy(), new_y_g_ref) new_a_ref = np.ones([3, 3]) * 3.0 new_b_ref = np.ones([3, 3]) * 3.0 new_c_ref = np.ones([3, 3]) * 12.0 - self.assertTrue(np.array_equal(new_a.numpy(), new_a_ref)) - self.assertTrue(np.array_equal(new_b.numpy(), new_b_ref)) - self.assertTrue(np.array_equal(new_c.numpy(), new_c_ref)) + np.testing.assert_array_equal(new_a.numpy(), new_a_ref) + np.testing.assert_array_equal(new_b.numpy(), new_b_ref) + np.testing.assert_array_equal(new_c.numpy(), new_c_ref) x_grad_ref = np.ones([3, 3]) * 0.0 - self.assertTrue(np.array_equal(x.grad.numpy(), x_grad_ref)) + np.testing.assert_array_equal(x.grad.numpy(), x_grad_ref) y_grad_ref = np.ones([3, 3]) * 0.0 - self.assertTrue(np.array_equal(y.grad.numpy(), y_grad_ref)) + np.testing.assert_array_equal(y.grad.numpy(), y_grad_ref) new_out_g_ref = np.ones([3, 3]) * 3.0 - self.assertTrue( - np.array_equal(new_out_g.grad.numpy(), new_out_g_ref)) + np.testing.assert_array_equal(new_out_g.grad.numpy(), new_out_g_ref) new_x_g_g_ref = np.ones([3, 3]) * 0.0 new_y_g_g_ref = np.ones([3, 3]) * 3.0 - self.assertTrue( - np.array_equal(new_x_g_g.grad.numpy(), new_x_g_g_ref)) - self.assertTrue( - np.array_equal(new_y_g_g.grad.numpy(), new_y_g_g_ref)) + np.testing.assert_array_equal(new_x_g_g.grad.numpy(), new_x_g_g_ref) + np.testing.assert_array_equal(new_y_g_g.grad.numpy(), new_y_g_g_ref) class TestDygraphTripleGrad(TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py index 84180fa299bdb..2cc157ae050cf 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py @@ -25,7 +25,7 @@ class TestImperativeUsingNonZeroGpu(unittest.TestCase): def run_main(self, np_arr, place): with guard(place): var = to_variable(np_arr) - self.assertTrue(np.array_equal(np_arr, var.numpy())) + np.testing.assert_array_equal(np_arr, var.numpy()) def func_non_zero_gpu(self): if not fluid.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 7138c2393ffca..df9236d245bb8 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -795,8 +795,8 @@ def run_static_graph(): dynamic_res = run_dynamic_graph() static_res = run_static_graph() - self.assertTrue(np.array_equal(dynamic_res[0], static_res[0])) - 
self.assertTrue(np.array_equal(dynamic_res[1], static_res[1])) + np.testing.assert_array_equal(dynamic_res[0], static_res[0]) + np.testing.assert_array_equal(dynamic_res[1], static_res[1]) def test_order(self): with framework._test_eager_guard(): @@ -819,7 +819,7 @@ def config(self): self.num_ops = 9 def check_result(self, a, b): - self.assertTrue(np.array_equal(a, b)) + np.testing.assert_array_equal(a, b) self.assertTrue(np.allclose(np.matmul(a, a.T), 9 * np.eye(10))) def func_orthogonal(self): @@ -878,7 +878,7 @@ def config(self): self.num_ops = 8 def check_result(self, a, b): - self.assertTrue(np.array_equal(a, b)) + np.testing.assert_array_equal(a, b) self.assertTrue(np.allclose(np.matmul(a.T, a), 4 * np.eye(10))) @@ -897,7 +897,7 @@ def config(self): self.num_ops = 8 def check_result(self, a, b): - self.assertTrue(np.array_equal(a, b)) + np.testing.assert_array_equal(a, b) self.assertTrue(np.allclose(np.matmul(a.T, a), np.eye(10), atol=1.e-6)) self.assertTrue(np.allclose(np.matmul(a, a.T), np.eye(10), atol=1.e-6)) @@ -922,7 +922,7 @@ def config(self): self.kernel_size = (3, 3) def check_result(self, a, b): - self.assertTrue(np.array_equal(a, b)) + np.testing.assert_array_equal(a, b) a = a.reshape(6, -1) self.assertTrue(np.allclose(np.matmul(a, a.T), 9 * np.eye(6))) @@ -973,7 +973,7 @@ def config(self): self.kernel_size = (3, 3) def check_result(self, a, b): - self.assertTrue(np.array_equal(a, b)) + np.testing.assert_array_equal(a, b) a = a.reshape(50, -1) self.assertTrue(np.allclose(np.matmul(a.T, a), 4 * np.eye(36))) @@ -993,7 +993,7 @@ def config(self): self.kernel_size = (3, 3) def check_result(self, a, b): - self.assertTrue(np.array_equal(a, b)) + np.testing.assert_array_equal(a, b) a = a.reshape(36, -1) self.assertTrue(np.allclose(np.matmul(a.T, a), np.eye(36), atol=1.e-6)) self.assertTrue(np.allclose(np.matmul(a, a.T), np.eye(36), atol=1.e-6)) @@ -1014,8 +1014,8 @@ def config(self): self.num_ops = 8 #fill_constant*2, reshape*2, assign_value*2, scatter, cast def check_result(self, w_dygraph, w_static, conv_in, conv_out): - self.assertTrue(np.array_equal(w_dygraph, w_static)) - self.assertTrue(np.array_equal(conv_out, conv_in[:, 0:2, 1:9])) + np.testing.assert_array_equal(w_dygraph, w_static) + np.testing.assert_array_equal(conv_out, conv_in[:, 0:2, 1:9]) def func_dirac(self): self.config() @@ -1079,11 +1079,11 @@ def config(self): self.num_ops = 8 def check_result(self, w_dygraph, w_static, conv_in, conv_out): - self.assertTrue(np.array_equal(w_dygraph, w_static)) - self.assertTrue( - np.array_equal(conv_out[:, 0:4, :, :], conv_in[:, :, 1:9, 1:9])) - self.assertTrue( - np.array_equal(conv_out[:, 4:8, :, :], np.zeros([8, 4, 8, 8]))) + np.testing.assert_array_equal(w_dygraph, w_static) + np.testing.assert_array_equal(conv_out[:, 0:4, :, :], conv_in[:, :, 1:9, + 1:9]) + np.testing.assert_array_equal(conv_out[:, 4:8, :, :], + np.zeros([8, 4, 8, 8])) # initialize Conv3D weight @@ -1101,13 +1101,11 @@ def config(self): self.num_ops = 7 def check_result(self, w_dygraph, w_static, conv_in, conv_out): - self.assertTrue(np.array_equal(w_dygraph, w_static)) - self.assertTrue( - np.array_equal(conv_out[:, 0:5, :, :, :], conv_in[:, :, 1:9, 1:9, - 1:9])) - self.assertTrue( - np.array_equal(conv_out[:, 5:10, :, :, :], conv_in[:, :, 1:9, 1:9, - 1:9])) + np.testing.assert_array_equal(w_dygraph, w_static) + np.testing.assert_array_equal(conv_out[:, 0:5, :, :, :], + conv_in[:, :, 1:9, 1:9, 1:9]) + np.testing.assert_array_equal(conv_out[:, 5:10, :, :, :], + conv_in[:, :, 1:9, 1:9, 1:9]) def 
test_error(self): self.config() diff --git a/python/paddle/fluid/tests/unittests/test_inplace.py b/python/paddle/fluid/tests/unittests/test_inplace.py index b81fcd90746d1..94e30a5e8a163 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace.py +++ b/python/paddle/fluid/tests/unittests/test_inplace.py @@ -142,7 +142,7 @@ def func_test_inplace_api(self): self.assertTrue(id(var) == id(inplace_var)) inplace_var[0] = 2. - self.assertTrue(np.array_equal(var.numpy(), inplace_var.numpy())) + np.testing.assert_array_equal(var.numpy(), inplace_var.numpy()) def test_inplace_api(self): with _test_eager_guard(): @@ -276,7 +276,7 @@ def func_test_backward_success_2(self): loss.backward() grad_var_a = var_a.grad.numpy() - self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a)) + np.testing.assert_array_equal(grad_var_a_inplace, grad_var_a) def test_backward_success_2(self): with _test_eager_guard(): @@ -506,7 +506,7 @@ def func_test_loss_is_inplace_var(self): loss.backward() grad_var_a = var_a.grad.numpy() - self.assertTrue(np.array_equal(inplace_grad_var_a, grad_var_a)) + np.testing.assert_array_equal(inplace_grad_var_a, grad_var_a) def test_loss_is_inplace_var(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py index 39e493b1b344d..73305368d4cf7 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py @@ -108,7 +108,7 @@ def run_program(enable_addto): res1, w1 = run_program(True) res2, w2 = run_program(False) - self.assertTrue(np.array_equal(res1, res2)) + np.testing.assert_array_equal(res1, res2) def test_nchw(self): self.check_result() diff --git a/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py b/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py index 581ce0d5d0259..2fc112870c685 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_auto_generated_apis.py @@ -56,7 +56,7 @@ def test_api(self): feed={"x": self.np_x}, fetch_list=[x, out]) - self.assertTrue(np.array_equal(fetch_x, self.np_x)) + np.testing.assert_array_equal(fetch_x, self.np_x) self.assertTrue( self.np_compare(fetch_out, self.executed_numpy_api(self.np_x))) diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 6aef26ac65ba0..ab6dea3940e80 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -388,9 +388,9 @@ def load_and_inference(self, train_layer, infer_layer): # inference & compare x = fluid.dygraph.to_variable( np.random.random((1, 784)).astype('float32')) - self.assertTrue( - np.array_equal(train_layer(x).numpy(), - infer_layer(x).numpy())) + np.testing.assert_array_equal( + train_layer(x).numpy(), + infer_layer(x).numpy()) def load_and_finetune(self, train_layer, load_train_layer): train_layer.train() @@ -398,8 +398,8 @@ def load_and_finetune(self, train_layer, load_train_layer): # train & compare img0, _, train_loss = train(train_layer) img1, _, load_train_loss = train(load_train_layer) - self.assertTrue( - np.array_equal(train_loss.numpy(), load_train_loss.numpy())) + np.testing.assert_array_equal(train_loss.numpy(), + load_train_loss.numpy()) def load_dygraph_state_dict(self, train_layer): 
train_layer.eval() @@ -414,9 +414,9 @@ def load_dygraph_state_dict(self, train_layer): # inference & compare x = fluid.dygraph.to_variable( np.random.random((1, 784)).astype('float32')) - self.assertTrue( - np.array_equal(train_layer(x).numpy(), - new_layer(x).numpy())) + np.testing.assert_array_equal( + train_layer(x).numpy(), + new_layer(x).numpy()) def test_load_dygraph_no_path(self): model_path = os.path.join(self.temp_dir.name, @@ -673,9 +673,9 @@ def test_output_spec(self): infer_layer = paddle.jit.load(model_path) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32')) - self.assertTrue( - np.array_equal(train_layer(x)[0].numpy(), - infer_layer(x).numpy())) + np.testing.assert_array_equal( + train_layer(x)[0].numpy(), + infer_layer(x).numpy()) def test_save_no_support_config_error(self): layer = LinearNet(784, 1) @@ -778,9 +778,9 @@ def test_load_pruned_model(self): x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32')) - self.assertTrue( - np.array_equal(train_layer(x)[0].numpy(), - infer_layer(x).numpy())) + np.testing.assert_array_equal( + train_layer(x)[0].numpy(), + infer_layer(x).numpy()) def test_load_var_not_in_extra_var_info(self): self.train_and_save() @@ -831,10 +831,12 @@ def verify_inference_correctness(self, else: pred = layer(x).numpy() loaded_pred = loaded_layer(x).numpy() - self.assertTrue( - np.array_equal(pred, loaded_pred), - msg="Result diff when load and inference:\nlayer result:\n{}\n" \ - "loaded layer result:\n{}".format(pred, loaded_pred)) + np.testing.assert_array_equal( + pred, + loaded_pred, + err_msg= + 'Result diff when load and inference:\nlayer result:\n{}\nloaded layer result:\n{}' + .format(pred, loaded_pred)) def test_no_prune_to_static_after_train(self): layer = LinearNet(784, 1) @@ -1056,7 +1058,7 @@ def test_save_load_empty_layer(self): paddle.jit.save(layer, self.model_path) load_layer = paddle.jit.load(self.model_path) load_out = load_layer(x) - self.assertTrue(np.array_equal(out, load_out)) + np.testing.assert_array_equal(out, load_out) class TestJitSaveLoadNoParamLayer(unittest.TestCase): @@ -1079,7 +1081,7 @@ def test_save_load_no_param_layer(self): paddle.jit.save(layer, self.model_path) load_layer = paddle.jit.load(self.model_path) load_out = load_layer(x, y) - self.assertTrue(np.array_equal(out, load_out)) + np.testing.assert_array_equal(out, load_out) class TestJitSaveLoadMultiMethods(unittest.TestCase): @@ -1506,7 +1508,7 @@ def anothor_forward(self, x): load_func = paddle.jit.load(path) load_result = load_func(inps) - self.assertTrue(np.array_equal(load_result.numpy(), origin.numpy())) + np.testing.assert_array_equal(load_result.numpy(), origin.numpy()) class TestJitSaveLoadFunctionWithParamCase2(unittest.TestCase): @@ -1546,8 +1548,8 @@ def anothor_forward(self, x): load_result = load_func(inps) - self.assertTrue( - np.array_equal(origin_result.numpy(), load_result.numpy())) + np.testing.assert_array_equal(origin_result.numpy(), + load_result.numpy()) class TestJitSaveLoadFunctionWithParamCase3(unittest.TestCase): @@ -1586,7 +1588,7 @@ def anothor_forward(self, x): load_func = paddle.jit.load(path) load_result = load_func(inps) - self.assertTrue(np.array_equal(load_result.numpy(), origin.numpy())) + np.testing.assert_array_equal(load_result.numpy(), origin.numpy()) class TestJitSaveLoadDataParallel(unittest.TestCase): @@ -1605,10 +1607,12 @@ def verify_inference_correctness(self, layer, path): x = paddle.to_tensor(np.random.random((1, 784)).astype('float32')) pred = layer(x).numpy() loaded_pred 
= loaded_layer(x).numpy() - self.assertTrue( - np.array_equal(pred, loaded_pred), - msg="Result diff when load and inference:\nlayer result:\n{}\n" \ - "loaded layer result:\n{}".format(pred, loaded_pred)) + np.testing.assert_array_equal( + pred, + loaded_pred, + err_msg= + 'Result diff when load and inference:\nlayer result:\n{}\nloaded layer result:\n{}' + .format(pred, loaded_pred)) def test_jit_save_data_parallel_with_inputspec(self): layer = LinearNetNotDeclarative(784, 1) diff --git a/python/paddle/fluid/tests/unittests/test_lambv2_op.py b/python/paddle/fluid/tests/unittests/test_lambv2_op.py index 6ae2dbfb590bd..54f84a1bb9bd1 100644 --- a/python/paddle/fluid/tests/unittests/test_lambv2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lambv2_op.py @@ -237,8 +237,8 @@ def get_parameter(var): if multi_precision: params[0] = np.array(params[0]) params[1] = np.array(params[1]) - self.assertTrue( - np.array_equal(params[0], params[1].astype(np.float16))) + np.testing.assert_array_equal(params[0], + params[1].astype(np.float16)) return params[0].astype(np.float32) else: self.assertTrue(params[0] is not None) @@ -259,9 +259,8 @@ def get_parameter(var): fetch_list=[weight, bias]) weight_np = weight_np.astype('float32') bias_np = bias_np.astype('float32') - self.assertTrue(np.array_equal(weight_np, - get_parameter(weight))) - self.assertTrue(np.array_equal(bias_np, get_parameter(bias))) + np.testing.assert_array_equal(weight_np, get_parameter(weight)) + np.testing.assert_array_equal(bias_np, get_parameter(bias)) return weight_np, bias_np @switch_to_static_graph diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index 2ee1a1ba76f7b..1bae5b75210b4 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -381,7 +381,7 @@ def test_main(self): x_np, weight_np, bias_np, 'float32') def assert_equal(x, y): - self.assertTrue(np.array_equal(x, y)) + np.testing.assert_array_equal(x, y) assert_equal(y_np_1, y_np_2) assert_equal(x_g_np_1, x_g_np_2) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 551ba3ffb542b..20bc86646f74c 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -109,16 +109,16 @@ def forward(self, x, do_linear2=False): x = base.to_variable(inp) custom = CustomLayer(input_size=3, linear1_size=2) ret = custom(x, do_linear2=False) - self.assertTrue(np.array_equal(ret.numpy().shape, [3, 2])) + np.testing.assert_array_equal(ret.numpy().shape, [3, 2]) ret = custom(x, do_linear2=True) - self.assertTrue(np.array_equal(ret.numpy().shape, [3, 1])) + np.testing.assert_array_equal(ret.numpy().shape, [3, 1]) inp = np.ones([3, 3], dtype='float32') x = base.to_variable(inp) custom = CustomLayer(input_size=3, linear1_size=2) ret = custom(x, do_linear2=False) - self.assertTrue(np.array_equal(ret.numpy().shape, [3, 2])) + np.testing.assert_array_equal(ret.numpy().shape, [3, 2]) ret = custom(x, do_linear2=True) - self.assertTrue(np.array_equal(ret.numpy().shape, [3, 1])) + np.testing.assert_array_equal(ret.numpy().shape, [3, 1]) def test_dropout(self): inp = np.ones([3, 32, 32], dtype='float32') @@ -157,12 +157,12 @@ def test_dropout(self): dy_ret_value = dy_ret.numpy() dy_ret2_value = dy_ret2.numpy() - self.assertTrue(np.array_equal(dy_eager_ret_value, dy_eager_ret2_value)) - 
self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) + np.testing.assert_array_equal(dy_eager_ret_value, dy_eager_ret2_value) + np.testing.assert_array_equal(static_ret, dy_eager_ret_value) - self.assertTrue(np.array_equal(static_ret, static_ret2)) - self.assertTrue(np.array_equal(dy_ret_value, dy_ret2_value)) - self.assertTrue(np.array_equal(static_ret, dy_ret_value)) + np.testing.assert_array_equal(static_ret, static_ret2) + np.testing.assert_array_equal(dy_ret_value, dy_ret2_value) + np.testing.assert_array_equal(static_ret, dy_ret_value) def test_linear(self): inp = np.ones([3, 32, 32], dtype='float32') @@ -192,8 +192,8 @@ def test_linear(self): dy_ret = linear(t) dy_ret_value = dy_ret.numpy() - self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) - self.assertTrue(np.array_equal(static_ret, dy_ret_value)) + np.testing.assert_array_equal(static_ret, dy_eager_ret_value) + np.testing.assert_array_equal(static_ret, dy_ret_value) with self.static_graph(): @@ -243,8 +243,8 @@ def test_Flatten(self): dy_ret = flatten(t) dy_ret_value = dy_ret.numpy() - self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) - self.assertTrue(np.array_equal(static_ret, dy_ret_value)) + np.testing.assert_array_equal(static_ret, dy_eager_ret_value) + np.testing.assert_array_equal(static_ret, dy_ret_value) with self.static_graph(): @@ -338,9 +338,9 @@ def test_layer_norm(self): self.assertFalse(hasattr(lm, "_scale_w")) self.assertFalse(hasattr(lm, "_bias_w")) - self.assertTrue(np.array_equal(static_ret, static_ret2)) - self.assertTrue(np.array_equal(dy_eager_ret_value, static_ret2)) - self.assertTrue(np.array_equal(dy_ret_value, static_ret2)) + np.testing.assert_array_equal(static_ret, static_ret2) + np.testing.assert_array_equal(dy_eager_ret_value, static_ret2) + np.testing.assert_array_equal(dy_ret_value, static_ret2) with self.dynamic_graph(): with _test_eager_guard(): @@ -379,8 +379,8 @@ def test_SyncBatchNorm(self): my_syncbn = paddle.nn.SyncBatchNorm(3) dy_ret = my_syncbn(base.to_variable(t)) dy_ret_value = dy_ret.numpy() - self.assertTrue(np.array_equal(static_ret, dy_ret_value)) - self.assertTrue(np.array_equal(static_ret, dy_eager_ret_value)) + np.testing.assert_array_equal(static_ret, dy_ret_value) + np.testing.assert_array_equal(static_ret, dy_eager_ret_value) def test_relu(self): with self.static_graph(): @@ -536,21 +536,19 @@ def test_type(): self.assertFalse( np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) conv2d2.weight.set_value(conv2d1_weight_np) - self.assertTrue( - np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + np.testing.assert_array_equal(conv2d1_weight_np, + conv2d2.weight.numpy()) conv2d2.bias.set_value(conv2d1_bias) dy_ret1 = conv2d1(base.to_variable(images)) dy_ret2 = conv2d2(base.to_variable(images)) - self.assertTrue(np.array_equal(dy_ret1.numpy(), - dy_ret2.numpy())) + np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) conv2d2.weight = conv2d1.weight conv2d2.bias = conv2d1.bias - self.assertTrue( - np.array_equal(conv2d1.weight.numpy(), - conv2d2.weight.numpy())) - self.assertTrue( - np.array_equal(conv2d1.bias.numpy(), conv2d2.bias.numpy())) + np.testing.assert_array_equal(conv2d1.weight.numpy(), + conv2d2.weight.numpy()) + np.testing.assert_array_equal(conv2d1.bias.numpy(), + conv2d2.bias.numpy()) images = np.ones([2, 3, 5, 5], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") @@ -572,19 +570,19 @@ def test_type(): self.assertFalse( np.array_equal(conv2d1_weight_np, 
conv2d2.weight.numpy())) conv2d2.weight.set_value(conv2d1_weight_np) - self.assertTrue( - np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + np.testing.assert_array_equal(conv2d1_weight_np, + conv2d2.weight.numpy()) conv2d2.bias.set_value(conv2d1_bias) dy_ret1 = conv2d1(base.to_variable(images)) dy_ret2 = conv2d2(base.to_variable(images)) - self.assertTrue(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) conv2d2.weight = conv2d1.weight conv2d2.bias = conv2d1.bias - self.assertTrue( - np.array_equal(conv2d1.weight.numpy(), conv2d2.weight.numpy())) - self.assertTrue( - np.array_equal(conv2d1.bias.numpy(), conv2d2.bias.numpy())) + np.testing.assert_array_equal(conv2d1.weight.numpy(), + conv2d2.weight.numpy()) + np.testing.assert_array_equal(conv2d1.bias.numpy(), + conv2d2.bias.numpy()) def test_gru_unit(self): lod = [[2, 4, 3]] @@ -666,14 +664,14 @@ def test_gru_unit(self): dy_ret2 = gru2(base.to_variable(input), base.to_variable(hidden_input)) for o1, o2 in zip(dy_ret1, dy_ret2): - self.assertTrue(np.array_equal(o1.numpy(), o2.numpy())) + np.testing.assert_array_equal(o1.numpy(), o2.numpy()) gru2.weight = gru1.weight gru2.bias = gru1.bias - self.assertTrue( - np.array_equal(gru1.weight.numpy(), gru2.weight.numpy())) - self.assertTrue( - np.array_equal(gru1.bias.numpy(), gru2.bias.numpy())) + np.testing.assert_array_equal(gru1.weight.numpy(), + gru2.weight.numpy()) + np.testing.assert_array_equal(gru1.bias.numpy(), + gru2.bias.numpy()) custom_weight = np.random.randn(D, D * 3).astype("float32") weight_attr = fluid.ParamAttr(initializer=fluid.initializer. @@ -695,14 +693,13 @@ def test_gru_unit(self): dy_ret2 = gru2(base.to_variable(input), base.to_variable(hidden_input)) for o1, o2 in zip(dy_ret1, dy_ret2): - self.assertTrue(np.array_equal(o1.numpy(), o2.numpy())) + np.testing.assert_array_equal(o1.numpy(), o2.numpy()) gru2.weight = gru1.weight gru2.bias = gru1.bias - self.assertTrue( - np.array_equal(gru1.weight.numpy(), gru2.weight.numpy())) - self.assertTrue(np.array_equal(gru1.bias.numpy(), - gru2.bias.numpy())) + np.testing.assert_array_equal(gru1.weight.numpy(), + gru2.weight.numpy()) + np.testing.assert_array_equal(gru1.bias.numpy(), gru2.bias.numpy()) def test_elementwise_math(self): n = np.ones([3, 3], dtype='float32') @@ -816,8 +813,8 @@ def test_sequence_conv(self): }, fetch_list=[out], with_lod=True)[0] - self.assertTrue( - np.array_equal(np.array(static_rlt), np.array(static_rlt2))) + np.testing.assert_array_equal(np.array(static_rlt), + np.array(static_rlt2)) def test_conv2d_transpose(self): inp_np = np.arange(0, 24).reshape([2, 3, 2, 2]).astype('float32') @@ -889,21 +886,19 @@ def test_conv2d_transpose(self): self.assertFalse( np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) conv2d2.weight.set_value(conv2d1_weight_np) - self.assertTrue( - np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + np.testing.assert_array_equal(conv2d1_weight_np, + conv2d2.weight.numpy()) conv2d2.bias.set_value(conv2d1_bias) dy_ret1 = conv2d1(base.to_variable(images)) dy_ret2 = conv2d2(base.to_variable(images)) - self.assertTrue(np.array_equal(dy_ret1.numpy(), - dy_ret2.numpy())) + np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) conv2d2.weight = conv2d1.weight conv2d2.bias = conv2d1.bias - self.assertTrue( - np.array_equal(conv2d1.weight.numpy(), - conv2d2.weight.numpy())) - self.assertTrue( - np.array_equal(conv2d1.bias.numpy(), conv2d2.bias.numpy())) + 
np.testing.assert_array_equal(conv2d1.weight.numpy(), + conv2d2.weight.numpy()) + np.testing.assert_array_equal(conv2d1.bias.numpy(), + conv2d2.bias.numpy()) images = np.ones([2, 3, 5, 5], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2).astype("float32") @@ -925,19 +920,19 @@ def test_conv2d_transpose(self): self.assertFalse( np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) conv2d2.weight.set_value(conv2d1_weight_np) - self.assertTrue( - np.array_equal(conv2d1_weight_np, conv2d2.weight.numpy())) + np.testing.assert_array_equal(conv2d1_weight_np, + conv2d2.weight.numpy()) conv2d2.bias.set_value(conv2d1_bias) dy_ret1 = conv2d1(base.to_variable(images)) dy_ret2 = conv2d2(base.to_variable(images)) - self.assertTrue(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) conv2d2.weight = conv2d1.weight conv2d2.bias = conv2d1.bias - self.assertTrue( - np.array_equal(conv2d1.weight.numpy(), conv2d2.weight.numpy())) - self.assertTrue( - np.array_equal(conv2d1.bias.numpy(), conv2d2.bias.numpy())) + np.testing.assert_array_equal(conv2d1.weight.numpy(), + conv2d2.weight.numpy()) + np.testing.assert_array_equal(conv2d1.bias.numpy(), + conv2d2.bias.numpy()) with self.static_graph(): @@ -1064,11 +1059,11 @@ def test_bilinear_tensor_product(self): }, fetch_list=[out2])[0] - self.assertTrue(np.array_equal(dy_rlt2_value, static_rlt3)) - self.assertTrue(np.array_equal(dy_eager_rlt2_value, static_rlt3)) - self.assertTrue(np.array_equal(static_rlt2, static_rlt)) - self.assertTrue(np.array_equal(dy_rlt_value, static_rlt)) - self.assertTrue(np.array_equal(dy_eager_rlt_value, static_rlt)) + np.testing.assert_array_equal(dy_rlt2_value, static_rlt3) + np.testing.assert_array_equal(dy_eager_rlt2_value, static_rlt3) + np.testing.assert_array_equal(static_rlt2, static_rlt) + np.testing.assert_array_equal(dy_rlt_value, static_rlt) + np.testing.assert_array_equal(dy_eager_rlt_value, static_rlt) with self.dynamic_graph(): with _test_eager_guard(): @@ -1094,15 +1089,14 @@ def test_bilinear_tensor_product(self): base.to_variable(inp_np_y)) dy_rlt2 = btp2(base.to_variable(inp_np_x), base.to_variable(inp_np_y)) - self.assertTrue(np.array_equal(dy_rlt1.numpy(), - dy_rlt2.numpy())) + np.testing.assert_array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()) btp2.weight = btp1.weight btp2.bias = btp1.bias - self.assertTrue( - np.array_equal(btp1.weight.numpy(), btp2.weight.numpy())) - self.assertTrue( - np.array_equal(btp1.bias.numpy(), btp2.bias.numpy())) + np.testing.assert_array_equal(btp1.weight.numpy(), + btp2.weight.numpy()) + np.testing.assert_array_equal(btp1.bias.numpy(), + btp2.bias.numpy()) custom_weight = np.random.randn(6, 3, 3).astype("float32") weight_attr = fluid.ParamAttr(initializer=fluid.initializer. 
@@ -1124,14 +1118,13 @@ def test_bilinear_tensor_product(self): base.to_variable(inp_np_y)) dy_rlt2 = btp2(base.to_variable(inp_np_x), base.to_variable(inp_np_y)) - self.assertTrue(np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) + np.testing.assert_array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()) btp2.weight = btp1.weight btp2.bias = btp1.bias - self.assertTrue( - np.array_equal(btp1.weight.numpy(), btp2.weight.numpy())) - self.assertTrue(np.array_equal(btp1.bias.numpy(), - btp2.bias.numpy())) + np.testing.assert_array_equal(btp1.weight.numpy(), + btp2.weight.numpy()) + np.testing.assert_array_equal(btp1.bias.numpy(), btp2.bias.numpy()) def prelu_test(self, mode): inp_np = np.ones([5, 200, 100, 100]).astype('float32') @@ -1204,13 +1197,11 @@ def prelu_test(self, mode): prelu2.weight.set_value(prelu1.weight.numpy()) dy_rlt1 = prelu1(inp) dy_rlt2 = prelu2(inp) - self.assertTrue(np.array_equal(dy_rlt1.numpy(), - dy_rlt2.numpy())) + np.testing.assert_array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()) prelu2.weight = prelu1.weight - self.assertTrue( - np.array_equal(prelu1.weight.numpy(), - prelu2.weight.numpy())) + np.testing.assert_array_equal(prelu1.weight.numpy(), + prelu2.weight.numpy()) inp_np = np.random.randn(5, 200, 100, 100).astype("float32") inp = base.to_variable(inp_np) @@ -1230,11 +1221,11 @@ def prelu_test(self, mode): prelu2.weight.set_value(prelu1.weight.numpy()) dy_rlt1 = prelu1(inp) dy_rlt2 = prelu2(inp) - self.assertTrue(np.array_equal(dy_rlt1.numpy(), dy_rlt2.numpy())) + np.testing.assert_array_equal(dy_rlt1.numpy(), dy_rlt2.numpy()) prelu2.weight = prelu1.weight - self.assertTrue( - np.array_equal(prelu1.weight.numpy(), prelu2.weight.numpy())) + np.testing.assert_array_equal(prelu1.weight.numpy(), + prelu2.weight.numpy()) def test_prelu(self): self.prelu_test("channel") @@ -1292,16 +1283,16 @@ def test_embeding(self): rep2 = emb2(base.to_variable(inp_word)) self.assertFalse( np.array_equal(emb1.weight.numpy(), custom_weight)) - self.assertTrue( - np.array_equal(emb2.weight.numpy(), custom_weight)) + np.testing.assert_array_equal(emb2.weight.numpy(), + custom_weight) self.assertFalse(np.array_equal(rep1.numpy(), rep2.numpy())) emb2.weight.set_value(emb1.weight.numpy()) rep2 = emb2(base.to_variable(inp_word)) - self.assertTrue(np.array_equal(rep1.numpy(), rep2.numpy())) + np.testing.assert_array_equal(rep1.numpy(), rep2.numpy()) emb2.weight = emb1.weight - self.assertTrue( - np.array_equal(emb1.weight.numpy(), emb2.weight.numpy())) + np.testing.assert_array_equal(emb1.weight.numpy(), + emb2.weight.numpy()) custom_weight = np.random.randn(dict_size, 32).astype("float32") weight_attr = fluid.ParamAttr(initializer=fluid.initializer. 
@@ -1313,15 +1304,15 @@ def test_embeding(self): rep1 = emb1(base.to_variable(inp_word)) rep2 = emb2(base.to_variable(inp_word)) self.assertFalse(np.array_equal(emb1.weight.numpy(), custom_weight)) - self.assertTrue(np.array_equal(emb2.weight.numpy(), custom_weight)) + np.testing.assert_array_equal(emb2.weight.numpy(), custom_weight) self.assertFalse(np.array_equal(rep1.numpy(), rep2.numpy())) emb2.weight.set_value(emb1.weight.numpy()) rep2 = emb2(base.to_variable(inp_word)) - self.assertTrue(np.array_equal(rep1.numpy(), rep2.numpy())) + np.testing.assert_array_equal(rep1.numpy(), rep2.numpy()) emb2.weight = emb1.weight - self.assertTrue( - np.array_equal(emb1.weight.numpy(), emb2.weight.numpy())) + np.testing.assert_array_equal(emb1.weight.numpy(), + emb2.weight.numpy()) def test_nce(self): window_size = 5 @@ -1543,15 +1534,15 @@ def test_nce(self): nce2.bias.set_value(nce1.bias) nce1_loss = nce1(embs3, wl) nce2_loss = nce2(embs3, wl) - self.assertTrue( - np.array_equal(nce1_loss.numpy(), nce2_loss.numpy())) + np.testing.assert_array_equal(nce1_loss.numpy(), + nce2_loss.numpy()) nce2.weight = nce1.weight nce2.bias = nce1.bias - self.assertTrue( - np.array_equal(nce1.weight.numpy(), nce2.weight.numpy())) - self.assertTrue( - np.array_equal(nce1.bias.numpy(), nce2.bias.numpy())) + np.testing.assert_array_equal(nce1.weight.numpy(), + nce2.weight.numpy()) + np.testing.assert_array_equal(nce1.bias.numpy(), + nce2.bias.numpy()) custom_weight = np.random.randn(dict_size, 128).astype("float32") weight_attr = fluid.ParamAttr(initializer=fluid.initializer. @@ -1605,15 +1596,13 @@ def test_nce(self): nce2.bias.set_value(nce1.bias) nce1_loss = nce1(embs3, wl) nce2_loss = nce2(embs3, wl) - self.assertTrue(np.array_equal(nce1_loss.numpy(), - nce2_loss.numpy())) + np.testing.assert_array_equal(nce1_loss.numpy(), nce2_loss.numpy()) nce2.weight = nce1.weight nce2.bias = nce1.bias - self.assertTrue( - np.array_equal(nce1.weight.numpy(), nce2.weight.numpy())) - self.assertTrue(np.array_equal(nce1.bias.numpy(), - nce2.bias.numpy())) + np.testing.assert_array_equal(nce1.weight.numpy(), + nce2.weight.numpy()) + np.testing.assert_array_equal(nce1.bias.numpy(), nce2.bias.numpy()) def test_one_hot(self): with self.dynamic_graph(): @@ -1623,16 +1612,15 @@ def test_one_hot(self): one_hot_label1 = fluid.layers.one_hot(input=label, depth=4) one_hot_label2 = fluid.layers.one_hot( input=label, depth=fluid.dygraph.to_variable(np.array([4]))) - self.assertTrue( - np.array_equal(one_hot_label1.numpy(), - one_hot_label2.numpy())) + np.testing.assert_array_equal(one_hot_label1.numpy(), + one_hot_label2.numpy()) label = fluid.dygraph.to_variable(np.array([[1], [1], [3], [0]])) one_hot_label1 = fluid.layers.one_hot(input=label, depth=4) one_hot_label2 = fluid.layers.one_hot( input=label, depth=fluid.dygraph.to_variable(np.array([4]))) - self.assertTrue( - np.array_equal(one_hot_label1.numpy(), one_hot_label2.numpy())) + np.testing.assert_array_equal(one_hot_label1.numpy(), + one_hot_label2.numpy()) def test_split(self): with self.dynamic_graph(): @@ -1643,8 +1631,8 @@ def test_split(self): num_or_sections=2, dim=fluid.dygraph.to_variable( np.array([1]))) - self.assertTrue(np.array_equal(x0.numpy(), x00.numpy())) - self.assertTrue(np.array_equal(x1.numpy(), x11.numpy())) + np.testing.assert_array_equal(x0.numpy(), x00.numpy()) + np.testing.assert_array_equal(x1.numpy(), x11.numpy()) input = fluid.dygraph.to_variable(np.random.random((3, 8, 5))) x0, x1 = fluid.layers.split(input, num_or_sections=2, dim=1) @@ -1652,8 +1640,8 @@ 
def test_split(self): num_or_sections=2, dim=fluid.dygraph.to_variable( np.array([1]))) - self.assertTrue(np.array_equal(x0.numpy(), x00.numpy())) - self.assertTrue(np.array_equal(x1.numpy(), x11.numpy())) + np.testing.assert_array_equal(x0.numpy(), x00.numpy()) + np.testing.assert_array_equal(x1.numpy(), x11.numpy()) def test_topk(self): with self.dynamic_graph(): @@ -1662,20 +1650,19 @@ def test_topk(self): top5_values1, top5_indices1 = layers.topk(input, k=5) top5_values2, top5_indices2 = layers.topk( input, k=fluid.dygraph.to_variable(np.array([5]))) - self.assertTrue( - np.array_equal(top5_values1.numpy(), top5_values2.numpy())) - self.assertTrue( - np.array_equal(top5_indices1.numpy(), - top5_indices2.numpy())) + np.testing.assert_array_equal(top5_values1.numpy(), + top5_values2.numpy()) + np.testing.assert_array_equal(top5_indices1.numpy(), + top5_indices2.numpy()) input = fluid.dygraph.to_variable(np.random.random((13, 11))) top5_values1, top5_indices1 = layers.topk(input, k=5) top5_values2, top5_indices2 = layers.topk( input, k=fluid.dygraph.to_variable(np.array([5]))) - self.assertTrue( - np.array_equal(top5_values1.numpy(), top5_values2.numpy())) - self.assertTrue( - np.array_equal(top5_indices1.numpy(), top5_indices2.numpy())) + np.testing.assert_array_equal(top5_values1.numpy(), + top5_values2.numpy()) + np.testing.assert_array_equal(top5_indices1.numpy(), + top5_indices2.numpy()) def test_conv3d(self): with self.static_graph(): @@ -1737,21 +1724,19 @@ def test_conv3d(self): self.assertFalse( np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) conv3d2.weight.set_value(conv3d1_weight_np) - self.assertTrue( - np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + np.testing.assert_array_equal(conv3d1_weight_np, + conv3d2.weight.numpy()) conv3d1.bias.set_value(conv3d1_bias) dy_ret1 = conv3d1(base.to_variable(images)) dy_ret2 = conv3d2(base.to_variable(images)) - self.assertTrue(np.array_equal(dy_ret1.numpy(), - dy_ret2.numpy())) + np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) conv3d2.weight = conv3d1.weight conv3d2.bias = conv3d1.bias - self.assertTrue( - np.array_equal(conv3d1.weight.numpy(), - conv3d2.weight.numpy())) - self.assertTrue( - np.array_equal(conv3d1.bias.numpy(), conv3d2.bias.numpy())) + np.testing.assert_array_equal(conv3d1.weight.numpy(), + conv3d2.weight.numpy()) + np.testing.assert_array_equal(conv3d1.bias.numpy(), + conv3d2.bias.numpy()) images = np.ones([2, 3, 6, 6, 6], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") @@ -1771,19 +1756,19 @@ def test_conv3d(self): self.assertFalse( np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) conv3d2.weight.set_value(conv3d1_weight_np) - self.assertTrue( - np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + np.testing.assert_array_equal(conv3d1_weight_np, + conv3d2.weight.numpy()) conv3d1.bias.set_value(conv3d1_bias) dy_ret1 = conv3d1(base.to_variable(images)) dy_ret2 = conv3d2(base.to_variable(images)) - self.assertTrue(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) conv3d2.weight = conv3d1.weight conv3d2.bias = conv3d1.bias - self.assertTrue( - np.array_equal(conv3d1.weight.numpy(), conv3d2.weight.numpy())) - self.assertTrue( - np.array_equal(conv3d1.bias.numpy(), conv3d2.bias.numpy())) + np.testing.assert_array_equal(conv3d1.weight.numpy(), + conv3d2.weight.numpy()) + np.testing.assert_array_equal(conv3d1.bias.numpy(), + conv3d2.bias.numpy()) def test_row_conv(self): 
input = np.arange(15).reshape([3, 5]).astype('float32') @@ -2142,17 +2127,14 @@ def test_tree_conv(self): base.to_variable(adj)) dy_ret2 = treeConv2(base.to_variable(vectors), base.to_variable(adj)) - self.assertTrue(np.array_equal(dy_ret1.numpy(), - dy_ret2.numpy())) + np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) treeConv2.weight = treeConv1.weight treeConv2.bias = treeConv1.bias - self.assertTrue( - np.array_equal(treeConv1.weight.numpy(), - treeConv2.weight.numpy())) - self.assertTrue( - np.array_equal(treeConv1.bias.numpy(), - treeConv2.bias.numpy())) + np.testing.assert_array_equal(treeConv1.weight.numpy(), + treeConv2.weight.numpy()) + np.testing.assert_array_equal(treeConv1.bias.numpy(), + treeConv2.bias.numpy()) custom_weight = np.random.randn(5, 3, 6, 1).astype("float32") weight_attr = fluid.ParamAttr(initializer=fluid.initializer. @@ -2179,15 +2161,14 @@ def test_tree_conv(self): base.to_variable(adj)) dy_ret2 = treeConv2(base.to_variable(vectors), base.to_variable(adj)) - self.assertTrue(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) treeConv2.weight = treeConv1.weight treeConv2.bias = treeConv1.bias - self.assertTrue( - np.array_equal(treeConv1.weight.numpy(), - treeConv2.weight.numpy())) - self.assertTrue( - np.array_equal(treeConv1.bias.numpy(), treeConv2.bias.numpy())) + np.testing.assert_array_equal(treeConv1.weight.numpy(), + treeConv2.weight.numpy()) + np.testing.assert_array_equal(treeConv1.bias.numpy(), + treeConv2.bias.numpy()) def test_conv3d_transpose(self): input_array = np.arange(0, 48).reshape([2, 3, 2, 2, @@ -2257,21 +2238,19 @@ def test_conv3d_transpose(self): self.assertFalse( np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) conv3d2.weight.set_value(conv3d1_weight_np) - self.assertTrue( - np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + np.testing.assert_array_equal(conv3d1_weight_np, + conv3d2.weight.numpy()) conv3d1.bias.set_value(conv3d1_bias) dy_ret1 = conv3d1(base.to_variable(images)) dy_ret2 = conv3d2(base.to_variable(images)) - self.assertTrue(np.array_equal(dy_ret1.numpy(), - dy_ret2.numpy())) + np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) conv3d2.weight = conv3d1.weight conv3d2.bias = conv3d1.bias - self.assertTrue( - np.array_equal(conv3d1.weight.numpy(), - conv3d2.weight.numpy())) - self.assertTrue( - np.array_equal(conv3d1.bias.numpy(), conv3d2.bias.numpy())) + np.testing.assert_array_equal(conv3d1.weight.numpy(), + conv3d2.weight.numpy()) + np.testing.assert_array_equal(conv3d1.bias.numpy(), + conv3d2.bias.numpy()) images = np.ones([2, 3, 6, 6, 6], dtype='float32') custom_weight = np.random.randn(3, 3, 2, 2, 2).astype("float32") @@ -2297,19 +2276,19 @@ def test_conv3d_transpose(self): self.assertFalse( np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) conv3d2.weight.set_value(conv3d1_weight_np) - self.assertTrue( - np.array_equal(conv3d1_weight_np, conv3d2.weight.numpy())) + np.testing.assert_array_equal(conv3d1_weight_np, + conv3d2.weight.numpy()) conv3d1.bias.set_value(conv3d1_bias) dy_ret1 = conv3d1(base.to_variable(images)) dy_ret2 = conv3d2(base.to_variable(images)) - self.assertTrue(np.array_equal(dy_ret1.numpy(), dy_ret2.numpy())) + np.testing.assert_array_equal(dy_ret1.numpy(), dy_ret2.numpy()) conv3d2.weight = conv3d1.weight conv3d2.bias = conv3d1.bias - self.assertTrue( - np.array_equal(conv3d1.weight.numpy(), conv3d2.weight.numpy())) - self.assertTrue( - np.array_equal(conv3d1.bias.numpy(), 
conv3d2.bias.numpy())) + np.testing.assert_array_equal(conv3d1.weight.numpy(), + conv3d2.weight.numpy()) + np.testing.assert_array_equal(conv3d1.bias.numpy(), + conv3d2.bias.numpy()) def test_eye_op(self): np_eye = np.eye(3, 2) @@ -2398,7 +2377,7 @@ def body2(i): layers.while_loop(cond1, body2, [j]) - self.assertTrue(np.array_equal(static_ret[0], dy_ret[0].numpy())) + np.testing.assert_array_equal(static_ret[0], dy_ret[0].numpy()) def test_while_loop(self): with _test_eager_guard(): @@ -2598,8 +2577,8 @@ def greater_equal_branch(a, b): lambda: less_than_branch(a, b)) eager_dynamic_res = out.numpy() eager_dynamic_res2 = out2.numpy() - self.assertTrue( - np.array_equal(eager_dynamic_res, eager_dynamic_res2)) + np.testing.assert_array_equal(eager_dynamic_res, + eager_dynamic_res2) with self.assertRaises(TypeError): layers.cond(a < b, 'str', 'str') with self.assertRaises(TypeError): @@ -2613,14 +2592,14 @@ def greater_equal_branch(a, b): lambda: less_than_branch(a, b)) dynamic_res = out.numpy() dynamic_res2 = out2.numpy() - self.assertTrue(np.array_equal(dynamic_res, dynamic_res2)) + np.testing.assert_array_equal(dynamic_res, dynamic_res2) with self.assertRaises(TypeError): layers.cond(a < b, 'str', 'str') with self.assertRaises(TypeError): layers.cond(a >= b, 'str', 'str') - self.assertTrue(np.array_equal(static_res, dynamic_res)) - self.assertTrue(np.array_equal(static_res, eager_dynamic_res)) + np.testing.assert_array_equal(static_res, dynamic_res) + np.testing.assert_array_equal(static_res, eager_dynamic_res) def test_case(self): @@ -2683,10 +2662,10 @@ def fn_3(): dynamic_res1 = out_1.numpy() dynamic_res2 = out_2.numpy() - self.assertTrue(np.array_equal(static_res1, dynamic_res1)) - self.assertTrue(np.array_equal(static_res2, dynamic_res2)) - self.assertTrue(np.array_equal(static_res1, eager_dynamic_res1)) - self.assertTrue(np.array_equal(static_res2, eager_dynamic_res2)) + np.testing.assert_array_equal(static_res1, dynamic_res1) + np.testing.assert_array_equal(static_res2, dynamic_res2) + np.testing.assert_array_equal(static_res1, eager_dynamic_res1) + np.testing.assert_array_equal(static_res2, eager_dynamic_res2) def test_switch_case(self): @@ -2768,12 +2747,12 @@ def fn_3(): dynamic_res2 = out_2.numpy() dynamic_res3 = out_3.numpy() - self.assertTrue(np.array_equal(static_res1, dynamic_res1)) - self.assertTrue(np.array_equal(static_res2, dynamic_res2)) - self.assertTrue(np.array_equal(static_res3, dynamic_res3)) - self.assertTrue(np.array_equal(static_res1, eager_dynamic_res1)) - self.assertTrue(np.array_equal(static_res2, eager_dynamic_res2)) - self.assertTrue(np.array_equal(static_res3, eager_dynamic_res3)) + np.testing.assert_array_equal(static_res1, dynamic_res1) + np.testing.assert_array_equal(static_res2, dynamic_res2) + np.testing.assert_array_equal(static_res3, dynamic_res3) + np.testing.assert_array_equal(static_res1, eager_dynamic_res1) + np.testing.assert_array_equal(static_res2, eager_dynamic_res2) + np.testing.assert_array_equal(static_res3, eager_dynamic_res3) def test_crop_tensor(self): with self.static_graph(): @@ -2848,7 +2827,7 @@ def test_accuracy(self): predict = fluid.layers.softmax(fc_out) dynamic_out = fluid.layers.accuracy(input=predict, label=label, k=5) - self.assertTrue(np.array_equal(static_out[0], dynamic_out.numpy())) + np.testing.assert_array_equal(static_out[0], dynamic_out.numpy()) class TestBook(LayerTest): @@ -2909,9 +2888,11 @@ def func_all_layers(self): continue if method.__name__ not in self.not_compare_static_dygraph_set: - self.assertTrue( - 
np.array_equal(static_result[0], dy_result_value), - "Result of function [{}] not equal".format(method.__name__)) + np.testing.assert_array_equal( + static_result[0], + dy_result_value, + err_msg='Result of function [{}] not equal'.format( + method.__name__)) def test_all_layers(self): with _test_eager_guard(): @@ -4210,8 +4191,8 @@ def test_roi_pool(self): 0.5, rois_num=rois_num_dy) dy_res_value = dy_res[0].numpy() - self.assertTrue(np.array_equal(static_res, dy_res_value)) - self.assertTrue(np.array_equal(static_res, dy_eager_res_value)) + np.testing.assert_array_equal(static_res, dy_res_value) + np.testing.assert_array_equal(static_res, dy_eager_res_value) def test_sequence_enumerate(self): # TODO(minqiyang): dygraph do not support lod now @@ -4261,8 +4242,8 @@ def test_roi_align(self): 2, rois_num=rois_num_dy) dy_res_value = dy_res.numpy() - self.assertTrue(np.array_equal(static_res, dy_eager_res_value)) - self.assertTrue(np.array_equal(static_res, dy_res_value)) + np.testing.assert_array_equal(static_res, dy_eager_res_value) + np.testing.assert_array_equal(static_res, dy_res_value) def test_dice_loss(self): num_classes = 4 @@ -4295,8 +4276,8 @@ def test_dice_loss(self): label_ = base.to_variable(label_np) dy_res = layers.dice_loss(input_, label_, eps) dy_res_value = dy_res.numpy() - self.assertTrue(np.array_equal(static_res, dy_res_value)) - self.assertTrue(np.array_equal(static_res, dy_eager_res_value)) + np.testing.assert_array_equal(static_res, dy_res_value) + np.testing.assert_array_equal(static_res, dy_eager_res_value) def test_roi_perspective_transform(self): # TODO(minqiyang): dygraph do not support lod now diff --git a/python/paddle/fluid/tests/unittests/test_load_op.py b/python/paddle/fluid/tests/unittests/test_load_op.py index a9865251355b9..7fd09be07781e 100644 --- a/python/paddle/fluid/tests/unittests/test_load_op.py +++ b/python/paddle/fluid/tests/unittests/test_load_op.py @@ -62,7 +62,7 @@ def test_load(self): exe = fluid.Executor(fluid.CPUPlace()) exe.run(start_prog) ret = exe.run(main_prog, fetch_list=[var.name]) - self.assertTrue(np.array_equal(self.ones, ret[0])) + np.testing.assert_array_equal(self.ones, ret[0]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py index 8d7f65116b63f..51799813fb6c2 100644 --- a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py @@ -64,7 +64,7 @@ def test_load_xpu(self): exe = fluid.Executor(fluid.XPUPlace(0)) exe.run(start_prog) ret = exe.run(main_prog, fetch_list=[var.name]) - self.assertTrue(np.array_equal(self.ones, ret[0])) + np.testing.assert_array_equal(self.ones, ret[0]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index 0005ccb4ab6a6..804eb9887879a 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -123,7 +123,7 @@ def train_and_save_model(self, only_params=False): def check_load_state_dict(self, orig_dict, load_dict): for var_name, value in six.iteritems(orig_dict): - self.assertTrue(np.array_equal(value, load_dict[var_name])) + np.testing.assert_array_equal(value, load_dict[var_name]) def test_load_default(self): self.save_dirname = os.path.join( diff --git 
a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py index 793d0e9bf5ab8..c7a118b3731f7 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py @@ -17,7 +17,7 @@ import unittest import paddle import paddle.fluid.core as core -import numpy +import numpy as np class TestLoDTensorArray(unittest.TestCase): @@ -30,7 +30,7 @@ def test_get_set(self): cpu = core.CPUPlace() for i in range(10): t = core.LoDTensor() - t.set(numpy.array([i], dtype='float32'), cpu) + t.set(np.array([i], dtype='float32'), cpu) t.set_recursive_sequence_lengths([[1]]) tensor_array.append(t) @@ -38,16 +38,15 @@ def test_get_set(self): for i in range(10): t = tensor_array[i] - self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32')) + self.assertEqual(np.array(t), np.array([i], dtype='float32')) self.assertEqual([[1]], t.recursive_sequence_lengths()) t = core.LoDTensor() - t.set(numpy.array([i + 10], dtype='float32'), cpu) + t.set(np.array([i + 10], dtype='float32'), cpu) t.set_recursive_sequence_lengths([[1]]) tensor_array[i] = t t = tensor_array[i] - self.assertEqual(numpy.array(t), - numpy.array([i + 10], dtype='float32')) + self.assertEqual(np.array(t), np.array([i + 10], dtype='float32')) self.assertEqual([[1]], t.recursive_sequence_lengths()) @@ -60,13 +59,12 @@ def setUp(self): def test_initialized_list_and_error(self): paddle.disable_static() init_data = [ - numpy.random.random(shape).astype('float32') - for shape in self.shapes + np.random.random(shape).astype('float32') for shape in self.shapes ] array = paddle.tensor.create_array( 'float32', [paddle.to_tensor(x) for x in init_data]) for res, gt in zip(array, init_data): - self.assertTrue(numpy.array_equal(res, gt)) + np.testing.assert_array_equal(res, gt) # test for None array = paddle.tensor.create_array('float32') diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py index 9dc7c1aa63656..d5da01d47a869 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py @@ -231,12 +231,12 @@ def setUp(self): def test_embedding_weights(self): result = convert_uint16_to_float(self.result[0]) - self.assertTrue(np.array_equal(self.w_fp32, result)) + np.testing.assert_array_equal(self.w_fp32, result) def test_lookup_results(self): lookup_result = convert_uint16_to_float(self.result[1]) lookup_ref = _lookup(self.w_fp32, self.ids, self.flat_ids) - self.assertTrue(np.array_equal(lookup_result, lookup_ref)) + np.testing.assert_array_equal(lookup_result, lookup_ref) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py index 06b232443a8ea..d9bca121656d8 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_bf16_op.py @@ -119,12 +119,12 @@ def setUp(self): def test_embedding_weights(self): result = convert_uint16_to_float(self.result[0]) - self.assertTrue(np.array_equal(self.w_fp32, result)) + np.testing.assert_array_equal(self.w_fp32, result) def test_lookup_results(self): lookup_result = convert_uint16_to_float(self.result[1]) lookup_ref = _lookup(self.w_fp32, self.ids, self.flat_ids, self.op_type) - 
self.assertTrue(np.array_equal(lookup_result, lookup_ref)) + np.testing.assert_array_equal(lookup_result, lookup_ref) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py index c337736c88144..fc84ac7cafa7e 100644 --- a/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py @@ -66,12 +66,36 @@ def margin_cross_entropy(logits, return loss, softmax +def python_api(logits, + label, + return_softmax=False, + ring_id=0, + rank=0, + nrank=0, + margin1=1.0, + margin2=0.5, + margin3=0.0, + scale=64.0): + return paddle.nn.functional.margin_cross_entropy( + logits, + label, + return_softmax=return_softmax, + margin1=margin1, + margin2=margin2, + margin3=margin3, + scale=scale, + group=None, + reduction=None) + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestMarginCrossEntropyOp(OpTest): def initParams(self): + self.python_api = python_api self.op_type = "margin_cross_entropy" + self.python_out_sig = ["Loss"] self.axis = -1 self.batch_dim = 5 self.feat_dim = 41 @@ -121,10 +145,14 @@ def setUp(self): } def test_check_output(self): - self.check_output_with_place(core.CUDAPlace(0), atol=1e-5) + self.check_output_with_place(core.CUDAPlace(0), + atol=1e-5, + check_eager=True) def test_check_grad(self): - self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], "Loss") + self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], + "Loss", + check_eager=True) @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -138,7 +166,8 @@ def test_check_grad(self): self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], "Loss", numeric_grad_delta=5e-2, - max_relative_error=5e-2) + max_relative_error=5e-2, + check_eager=True) @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -149,13 +178,16 @@ def init_dtype(self): self.dtype = np.float16 def test_check_output(self): - self.check_output_with_place(core.CUDAPlace(0), atol=5e-2) + self.check_output_with_place(core.CUDAPlace(0), + atol=5e-2, + check_eager=True) def test_check_grad(self): self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], "Loss", numeric_grad_delta=6e-1, - max_relative_error=6e-1) + max_relative_error=6e-1, + check_eager=True) @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -184,13 +216,17 @@ class TestMarginCrossEntropyOpCPU(TestMarginCrossEntropyOp): def test_check_output(self): try: - self.check_output_with_place(core.CPUPlace(), atol=1e-5) + self.check_output_with_place(core.CPUPlace(), + atol=1e-5, + check_eager=True) except RuntimeError: pass def test_check_grad(self): try: - self.check_grad_with_place(core.CPUPlace(), ["Logits"], "Loss") + self.check_grad_with_place(core.CPUPlace(), ["Logits"], + "Loss", + check_eager=True) except RuntimeError: pass @@ -208,6 +244,7 @@ def setUp(self): self.places.append(paddle.fluid.CUDAPlace(0)) def initParams(self): + self.python_out_sig = ["Loss"] self.seed = 2021 self.axis = -1 self.batch_dim = 5 @@ -356,6 +393,8 @@ def setUp(self): self.places.append(paddle.fluid.CUDAPlace(0)) def initParams(self): + self.python_api = python_api + self.python_out_sig = ["Loss"] self.seed = 2021 self.axis = -1 self.batch_dim = 10 diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py index 9dd47647a1a24..037ca7aa06657 100644 --- 
a/python/paddle/fluid/tests/unittests/test_math_op_patch.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py @@ -37,15 +37,15 @@ def test_add_scalar(self): # e = a + ab place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') + a_np = np.random.random(size=[10, 1]).astype('float32') b_np, c_np, d_np = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b, c, d]) - self.assertTrue(numpy.allclose(a_np + 10, b_np)) - ab_np = numpy.concatenate([a_np, b_np], axis=1) - self.assertTrue(numpy.allclose(ab_np + 10, c_np)) - d_expected = ab_np + numpy.concatenate([a_np, a_np], axis=1) - self.assertTrue(numpy.allclose(d_expected, d_np)) + self.assertTrue(np.allclose(a_np + 10, b_np)) + ab_np = np.concatenate([a_np, b_np], axis=1) + self.assertTrue(np.allclose(ab_np + 10, c_np)) + d_expected = ab_np + np.concatenate([a_np, a_np], axis=1) + self.assertTrue(np.allclose(d_expected, d_np)) @prog_scope() def test_radd_scalar(self): @@ -53,11 +53,11 @@ def test_radd_scalar(self): b = 10 + a place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') + a_np = np.random.random(size=[10, 1]).astype('float32') b_np = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - self.assertTrue(numpy.allclose(a_np + 10, b_np)) + self.assertTrue(np.allclose(a_np + 10, b_np)) @prog_scope() def test_sub_scalar(self): @@ -65,11 +65,11 @@ def test_sub_scalar(self): b = a - 10 place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') + a_np = np.random.random(size=[10, 1]).astype('float32') b_np = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - self.assertTrue(numpy.allclose(a_np - 10, b_np)) + self.assertTrue(np.allclose(a_np - 10, b_np)) @prog_scope() def test_radd_scalar(self): @@ -77,11 +77,11 @@ def test_radd_scalar(self): b = 10 - a place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') + a_np = np.random.random(size=[10, 1]).astype('float32') b_np = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - self.assertTrue(numpy.allclose(10 - a_np, b_np)) + self.assertTrue(np.allclose(10 - a_np, b_np)) @prog_scope() def test_mul_scalar(self): @@ -89,11 +89,11 @@ def test_mul_scalar(self): b = a * 10 place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') + a_np = np.random.random(size=[10, 1]).astype('float32') b_np = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - self.assertTrue(numpy.allclose(a_np * 10, b_np)) + self.assertTrue(np.allclose(a_np * 10, b_np)) @prog_scope() def test_rmul_scalar(self): @@ -101,11 +101,11 @@ def test_rmul_scalar(self): b = 10 * a place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') + a_np = np.random.random(size=[10, 1]).astype('float32') b_np = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - self.assertTrue(numpy.allclose(10 * a_np, b_np)) + self.assertTrue(np.allclose(10 * a_np, b_np)) @prog_scope() def test_div_scalar(self): @@ -113,11 +113,11 @@ def test_div_scalar(self): b = a / 10 place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') + a_np = np.random.random(size=[10, 1]).astype('float32') b_np = exe.run(fluid.default_main_program(), 
feed={"a": a_np}, fetch_list=[b]) - self.assertTrue(numpy.allclose(a_np / 10, b_np)) + self.assertTrue(np.allclose(a_np / 10, b_np)) @prog_scope() def test_rdiv_scalar(self): @@ -125,12 +125,12 @@ def test_rdiv_scalar(self): b = 10 / a place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') + 1e-2 + a_np = np.random.random(size=[10, 1]).astype('float32') + 1e-2 b_np = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - self.assertTrue(numpy.allclose(10 / a_np, b_np)) + self.assertTrue(np.allclose(10 / a_np, b_np)) @prog_scope() def test_div_two_tensor(self): @@ -139,15 +139,15 @@ def test_div_two_tensor(self): c = a / b place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') - b_np = numpy.random.random(size=[10, 1]).astype('float32') + 1e-2 + a_np = np.random.random(size=[10, 1]).astype('float32') + b_np = np.random.random(size=[10, 1]).astype('float32') + 1e-2 c_np = exe.run(fluid.default_main_program(), feed={ "a": a_np, 'b': b_np }, fetch_list=[c]) - self.assertTrue(numpy.allclose(a_np / b_np, c_np)) + self.assertTrue(np.allclose(a_np / b_np, c_np)) @prog_scope() def test_mul_two_tensor(self): @@ -156,15 +156,15 @@ def test_mul_two_tensor(self): c = a * b place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') - b_np = numpy.random.random(size=[10, 1]).astype('float32') + a_np = np.random.random(size=[10, 1]).astype('float32') + b_np = np.random.random(size=[10, 1]).astype('float32') c_np = exe.run(fluid.default_main_program(), feed={ "a": a_np, 'b': b_np }, fetch_list=[c]) - self.assertTrue(numpy.allclose(a_np * b_np, c_np)) + self.assertTrue(np.allclose(a_np * b_np, c_np)) @prog_scope() def test_add_two_tensor(self): @@ -173,15 +173,15 @@ def test_add_two_tensor(self): c = a + b place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') - b_np = numpy.random.random(size=[10, 1]).astype('float32') + a_np = np.random.random(size=[10, 1]).astype('float32') + b_np = np.random.random(size=[10, 1]).astype('float32') c_np = exe.run(fluid.default_main_program(), feed={ "a": a_np, 'b': b_np }, fetch_list=[c]) - self.assertTrue(numpy.allclose(a_np + b_np, c_np)) + self.assertTrue(np.allclose(a_np + b_np, c_np)) @prog_scope() def test_sub_two_tensor(self): @@ -190,15 +190,15 @@ def test_sub_two_tensor(self): c = a - b place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.random(size=[10, 1]).astype('float32') - b_np = numpy.random.random(size=[10, 1]).astype('float32') + a_np = np.random.random(size=[10, 1]).astype('float32') + b_np = np.random.random(size=[10, 1]).astype('float32') c_np = exe.run(fluid.default_main_program(), feed={ "a": a_np, 'b': b_np }, fetch_list=[c]) - self.assertTrue(numpy.allclose(a_np - b_np, c_np)) + self.assertTrue(np.allclose(a_np - b_np, c_np)) @prog_scope() def test_integer_div(self): @@ -206,13 +206,13 @@ def test_integer_div(self): b = a / 7 place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('int64') + a_np = np.array([3, 4, 10, 14, 9, 18]).astype('int64') b_np, = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) b_np_actual = (a_np / 7).astype('float32') - self.assertTrue(numpy.allclose(b_np, b_np_actual)) + self.assertTrue(np.allclose(b_np, b_np_actual)) @prog_scope() def test_equal(self): @@ -222,8 +222,8 @@ def 
test_equal(self): place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('float32') - b_np = numpy.array([3, 4, 11, 15, 8, 18]).astype('float32') + a_np = np.array([3, 4, 10, 14, 9, 18]).astype('float32') + b_np = np.array([3, 4, 11, 15, 8, 18]).astype('float32') c_np, = exe.run(fluid.default_main_program(), feed={ @@ -232,7 +232,7 @@ def test_equal(self): }, fetch_list=[c]) - self.assertTrue(numpy.array_equal(c_np, a_np == b_np)) + np.testing.assert_array_equal(c_np, a_np == b_np) self.assertEqual(c.dtype, fluid.core.VarDesc.VarType.BOOL) @prog_scope() @@ -247,8 +247,8 @@ def test_equal_and_cond(self): place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('float') - b_np = numpy.array([3, 4, 11, 15, 8, 18]).astype('float') + a_np = np.array([3, 4, 10, 14, 9, 18]).astype('float') + b_np = np.array([3, 4, 11, 15, 8, 18]).astype('float') c_np, = exe.run(fluid.default_main_program(), feed={ "a": a_np, @@ -256,7 +256,7 @@ def test_equal_and_cond(self): }, fetch_list=[c]) - self.assertTrue(numpy.array_equal(c_np, a_np - b_np)) + np.testing.assert_array_equal(c_np, a_np - b_np) @prog_scope() def test_neg(self): @@ -264,12 +264,12 @@ def test_neg(self): b = -a place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.uniform(-1, 1, size=[10, 1]).astype('float32') + a_np = np.random.uniform(-1, 1, size=[10, 1]).astype('float32') b_np = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - self.assertTrue(numpy.allclose(-a_np, b_np)) + self.assertTrue(np.allclose(-a_np, b_np)) @prog_scope() def test_astype(self): @@ -277,12 +277,12 @@ def test_astype(self): b = a.astype('float32') place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.random.uniform(-1, 1, size=[10, 1]).astype('float64') + a_np = np.random.uniform(-1, 1, size=[10, 1]).astype('float64') b_np = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - self.assertTrue(numpy.allclose(a_np.astype('float32'), b_np)) + self.assertTrue(np.allclose(a_np.astype('float32'), b_np)) def test_bitwise_and(self): x_np = np.random.randint(-100, 100, [2, 3, 5]).astype("int32") @@ -300,7 +300,7 @@ def test_bitwise_and(self): "y": y_np }, fetch_list=[z]) - self.assertTrue(np.array_equal(out[0], out_np)) + np.testing.assert_array_equal(out[0], out_np) @prog_scope() def test_bitwise_or(self): @@ -319,7 +319,7 @@ def test_bitwise_or(self): "y": y_np }, fetch_list=[z]) - self.assertTrue(np.array_equal(out[0], out_np)) + np.testing.assert_array_equal(out[0], out_np) @prog_scope() def test_bitwise_xor(self): @@ -338,7 +338,7 @@ def test_bitwise_xor(self): "y": y_np }, fetch_list=[z]) - self.assertTrue(np.array_equal(out[0], out_np)) + np.testing.assert_array_equal(out[0], out_np) @prog_scope() def test_bitwise_not(self): @@ -352,7 +352,7 @@ def test_bitwise_not(self): out = exe.run(fluid.default_main_program(), feed={"x": x_np}, fetch_list=[z]) - self.assertTrue(np.array_equal(out[0], out_np)) + np.testing.assert_array_equal(out[0], out_np) @prog_scope() def test_T(self): @@ -366,7 +366,7 @@ def test_T(self): out = exe.run(fluid.default_main_program(), feed={"x": x_np}, fetch_list=[z]) - self.assertTrue(np.array_equal(out[0], out_np)) + np.testing.assert_array_equal(out[0], out_np) @prog_scope() def test_ndim(self): @@ -380,8 +380,8 @@ def test_matmul(self): a = paddle.static.data(name='a', shape=[2, 3], dtype='float32') b = paddle.static.data(name='b', shape=[3, 5], dtype='float32') c = a @ b # 
__matmul__ - a_np = numpy.random.uniform(-1, 1, size=[2, 3]).astype('float32') - b_np = numpy.random.uniform(-1, 1, size=[3, 5]).astype('float32') + a_np = np.random.uniform(-1, 1, size=[2, 3]).astype('float32') + b_np = np.random.uniform(-1, 1, size=[3, 5]).astype('float32') place = paddle.CPUPlace() exe = paddle.static.Executor(place) c_np = exe.run(paddle.static.default_main_program(), @@ -390,7 +390,7 @@ def test_matmul(self): "b": b_np }, fetch_list=[c]) - self.assertTrue(numpy.allclose(a_np @ b_np, c_np)) + self.assertTrue(np.allclose(a_np @ b_np, c_np)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py index 92fa9049dab54..fd768b1516f3f 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py @@ -35,7 +35,7 @@ def func_test_add(self): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = a + b - self.assertTrue(np.array_equal(res.numpy(), a_np + b_np)) + np.testing.assert_array_equal(res.numpy(), a_np + b_np) def test_add(self): with _test_eager_guard(): @@ -49,7 +49,7 @@ def func_test_sub(self): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = a - b - self.assertTrue(np.array_equal(res.numpy(), a_np - b_np)) + np.testing.assert_array_equal(res.numpy(), a_np - b_np) def test_sub(self): with _test_eager_guard(): @@ -63,7 +63,7 @@ def func_test_mul(self): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = a * b - self.assertTrue(np.array_equal(res.numpy(), a_np * b_np)) + np.testing.assert_array_equal(res.numpy(), a_np * b_np) def test_mul(self): with _test_eager_guard(): @@ -91,7 +91,7 @@ def func_test_add_scalar(self): a = fluid.dygraph.to_variable(a_np) b = 0.1 res = a + b - self.assertTrue(np.array_equal(res.numpy(), a_np + b)) + np.testing.assert_array_equal(res.numpy(), a_np + b) def test_add_scalar(self): with _test_eager_guard(): @@ -104,7 +104,7 @@ def func_test_add_scalar_reverse(self): a = fluid.dygraph.to_variable(a_np) b = 0.1 res = b + a - self.assertTrue(np.array_equal(res.numpy(), b + a_np)) + np.testing.assert_array_equal(res.numpy(), b + a_np) def test_add_scalar_reverse(self): with _test_eager_guard(): @@ -117,7 +117,7 @@ def func_test_sub_scalar(self): a = fluid.dygraph.to_variable(a_np) b = 0.1 res = a - b - self.assertTrue(np.array_equal(res.numpy(), a_np - b)) + np.testing.assert_array_equal(res.numpy(), a_np - b) def test_sub_scalar(self): with _test_eager_guard(): @@ -130,7 +130,7 @@ def func_test_sub_scalar_reverse(self): a = fluid.dygraph.to_variable(a_np) b = 0.1 res = b - a - self.assertTrue(np.array_equal(res.numpy(), b - a_np)) + np.testing.assert_array_equal(res.numpy(), b - a_np) def test_sub_scalar_reverse(self): with _test_eager_guard(): @@ -143,7 +143,7 @@ def func_test_mul_scalar(self): a = fluid.dygraph.to_variable(a_np) b = 0.1 res = a * b - self.assertTrue(np.array_equal(res.numpy(), a_np * b)) + np.testing.assert_array_equal(res.numpy(), a_np * b) def test_mul_scalar(self): with _test_eager_guard(): @@ -186,7 +186,7 @@ def func_test_floor_div(self): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = a // b - self.assertTrue(np.array_equal(res.numpy(), a_np // b_np)) + np.testing.assert_array_equal(res.numpy(), a_np // b_np) def test_floor_div(self): with _test_eager_guard(): @@ -200,7 +200,7 @@ def func_test_mod(self): a 
= fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = a % b - self.assertTrue(np.array_equal(res.numpy(), a_np % b_np)) + np.testing.assert_array_equal(res.numpy(), a_np % b_np) def test_mod(self): with _test_eager_guard(): @@ -218,19 +218,19 @@ def func_test_bitwise(self): out_np = x_np & y_np out = x & y - self.assertTrue(np.array_equal(out.numpy(), out_np)) + np.testing.assert_array_equal(out.numpy(), out_np) out_np = x_np | y_np out = x | y - self.assertTrue(np.array_equal(out.numpy(), out_np)) + np.testing.assert_array_equal(out.numpy(), out_np) out_np = x_np ^ y_np out = x ^ y - self.assertTrue(np.array_equal(out.numpy(), out_np)) + np.testing.assert_array_equal(out.numpy(), out_np) out_np = ~x_np out = ~x - self.assertTrue(np.array_equal(out.numpy(), out_np)) + np.testing.assert_array_equal(out.numpy(), out_np) def test_bitwise(self): with _test_eager_guard(): @@ -248,8 +248,8 @@ def func_test_equal(self): c = fluid.dygraph.to_variable(c_np) res1 = (a == b) res2 = (a == c) - self.assertTrue(np.array_equal(res1.numpy(), a_np == b_np)) - self.assertTrue(np.array_equal(res2.numpy(), a_np == c_np)) + np.testing.assert_array_equal(res1.numpy(), a_np == b_np) + np.testing.assert_array_equal(res2.numpy(), a_np == c_np) def test_equal(self): with _test_eager_guard(): @@ -266,8 +266,8 @@ def func_test_not_equal(self): c = fluid.dygraph.to_variable(c_np) res1 = (a != b) res2 = (a != c) - self.assertTrue(np.array_equal(res1.numpy(), a_np != b_np)) - self.assertTrue(np.array_equal(res2.numpy(), a_np != c_np)) + np.testing.assert_array_equal(res1.numpy(), a_np != b_np) + np.testing.assert_array_equal(res2.numpy(), a_np != c_np) def test_not_equal(self): with _test_eager_guard(): @@ -281,7 +281,7 @@ def func_test_less_than(self): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = (a < b) - self.assertTrue(np.array_equal(res.numpy(), a_np < b_np)) + np.testing.assert_array_equal(res.numpy(), a_np < b_np) def test_less_than(self): with _test_eager_guard(): @@ -295,7 +295,7 @@ def func_test_less_equal(self): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = (a <= b) - self.assertTrue(np.array_equal(res.numpy(), a_np <= b_np)) + np.testing.assert_array_equal(res.numpy(), a_np <= b_np) def test_less_equal(self): with _test_eager_guard(): @@ -309,7 +309,7 @@ def func_test_greater_than(self): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = (a > b) - self.assertTrue(np.array_equal(res.numpy(), a_np > b_np)) + np.testing.assert_array_equal(res.numpy(), a_np > b_np) def test_greater_than(self): with _test_eager_guard(): @@ -323,7 +323,7 @@ def func_test_greater_equal(self): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = (a >= b) - self.assertTrue(np.array_equal(res.numpy(), a_np >= b_np)) + np.testing.assert_array_equal(res.numpy(), a_np >= b_np) def test_greater_equal(self): with _test_eager_guard(): @@ -335,7 +335,7 @@ def func_test_neg(self): with fluid.dygraph.guard(): a = fluid.dygraph.to_variable(a_np) res = -a - self.assertTrue(np.array_equal(res.numpy(), -a_np)) + np.testing.assert_array_equal(res.numpy(), -a_np) def test_neg(self): with _test_eager_guard(): @@ -406,7 +406,7 @@ def func_test_add_different_dtype(self): a = fluid.dygraph.to_variable(a_np) b = fluid.dygraph.to_variable(b_np) res = a + b - self.assertTrue(np.array_equal(res.numpy(), a_np + b_np)) + np.testing.assert_array_equal(res.numpy(), a_np + b_np) def test_add_different_dtype(self): with 
_test_eager_guard(): @@ -420,7 +420,7 @@ def func_test_floordiv_different_dtype(self): a = paddle.to_tensor(a_np) b = paddle.to_tensor(b_np) res = a // b - self.assertTrue(np.array_equal(res.numpy(), a_np // b_np)) + np.testing.assert_array_equal(res.numpy(), a_np // b_np) def test_floordiv_different_dtype(self): with _test_eager_guard(): @@ -438,8 +438,8 @@ def func_test_astype(self): self.assertEqual(res1.dtype, res2.dtype) self.assertEqual(res1.dtype, res3.dtype) - self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) - self.assertTrue(np.array_equal(res1.numpy(), res3.numpy())) + np.testing.assert_array_equal(res1.numpy(), res2.numpy()) + np.testing.assert_array_equal(res1.numpy(), res3.numpy()) def test_astype(self): with _test_eager_guard(): @@ -454,7 +454,7 @@ def func_test_conpare_op_broadcast(self): b = fluid.dygraph.to_variable(b_np) self.assertEqual((a != b).dtype, fluid.core.VarDesc.VarType.BOOL) - self.assertTrue(np.array_equal((a != b).numpy(), a_np != b_np)) + np.testing.assert_array_equal((a != b).numpy(), a_np != b_np) def test_conpare_op_broadcast(self): with _test_eager_guard(): @@ -480,248 +480,195 @@ def func_test_tensor_patch_method(self): self.assertEqual(x.ndim, 2) self.assertEqual(x.size, 6) self.assertEqual(x.numel(), 6) - self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy())) - self.assertTrue(np.array_equal(x.tanh().numpy(), - paddle.tanh(x).numpy())) - self.assertTrue(np.array_equal(x.atan().numpy(), - paddle.atan(x).numpy())) - self.assertTrue(np.array_equal(x.abs().numpy(), paddle.abs(x).numpy())) + np.testing.assert_array_equal(x.exp().numpy(), paddle.exp(x).numpy()) + np.testing.assert_array_equal(x.tanh().numpy(), paddle.tanh(x).numpy()) + np.testing.assert_array_equal(x.atan().numpy(), paddle.atan(x).numpy()) + np.testing.assert_array_equal(x.abs().numpy(), paddle.abs(x).numpy()) m = x.abs() - self.assertTrue(np.array_equal(m.sqrt().numpy(), - paddle.sqrt(m).numpy())) - self.assertTrue( - np.array_equal(m.rsqrt().numpy(), - paddle.rsqrt(m).numpy())) - self.assertTrue(np.array_equal(x.ceil().numpy(), - paddle.ceil(x).numpy())) - self.assertTrue( - np.array_equal(x.floor().numpy(), - paddle.floor(x).numpy())) - self.assertTrue(np.array_equal(x.cos().numpy(), paddle.cos(x).numpy())) - self.assertTrue(np.array_equal(x.acos().numpy(), - paddle.acos(x).numpy())) - self.assertTrue(np.array_equal(x.asin().numpy(), - paddle.asin(x).numpy())) - self.assertTrue(np.array_equal(x.sin().numpy(), paddle.sin(x).numpy())) - self.assertTrue(np.array_equal(x.sinh().numpy(), - paddle.sinh(x).numpy())) - self.assertTrue(np.array_equal(x.cosh().numpy(), - paddle.cosh(x).numpy())) - self.assertTrue( - np.array_equal(x.round().numpy(), - paddle.round(x).numpy())) - self.assertTrue( - np.array_equal(x.reciprocal().numpy(), - paddle.reciprocal(x).numpy())) - self.assertTrue( - np.array_equal(x.square().numpy(), - paddle.square(x).numpy())) - self.assertTrue(np.array_equal(x.rank().numpy(), - paddle.rank(x).numpy())) - self.assertTrue(np.array_equal(x[0].t().numpy(), - paddle.t(x[0]).numpy())) - self.assertTrue( - np.array_equal(x.asinh().numpy(), - paddle.asinh(x).numpy())) + np.testing.assert_array_equal(m.sqrt().numpy(), paddle.sqrt(m).numpy()) + np.testing.assert_array_equal(m.rsqrt().numpy(), + paddle.rsqrt(m).numpy()) + np.testing.assert_array_equal(x.ceil().numpy(), paddle.ceil(x).numpy()) + np.testing.assert_array_equal(x.floor().numpy(), + paddle.floor(x).numpy()) + np.testing.assert_array_equal(x.cos().numpy(), paddle.cos(x).numpy()) + 
np.testing.assert_array_equal(x.acos().numpy(), paddle.acos(x).numpy()) + np.testing.assert_array_equal(x.asin().numpy(), paddle.asin(x).numpy()) + np.testing.assert_array_equal(x.sin().numpy(), paddle.sin(x).numpy()) + np.testing.assert_array_equal(x.sinh().numpy(), paddle.sinh(x).numpy()) + np.testing.assert_array_equal(x.cosh().numpy(), paddle.cosh(x).numpy()) + np.testing.assert_array_equal(x.round().numpy(), + paddle.round(x).numpy()) + np.testing.assert_array_equal(x.reciprocal().numpy(), + paddle.reciprocal(x).numpy()) + np.testing.assert_array_equal(x.square().numpy(), + paddle.square(x).numpy()) + np.testing.assert_array_equal(x.rank().numpy(), paddle.rank(x).numpy()) + np.testing.assert_array_equal(x[0].t().numpy(), paddle.t(x[0]).numpy()) + np.testing.assert_array_equal(x.asinh().numpy(), + paddle.asinh(x).numpy()) ### acosh(x) = nan, need to change input t_np = np.random.uniform(1, 2, [2, 3]).astype(self.dtype) t = paddle.to_tensor(t_np) - self.assertTrue( - np.array_equal(t.acosh().numpy(), - paddle.acosh(t).numpy())) - self.assertTrue( - np.array_equal(x.atanh().numpy(), - paddle.atanh(x).numpy())) + np.testing.assert_array_equal(t.acosh().numpy(), + paddle.acosh(t).numpy()) + np.testing.assert_array_equal(x.atanh().numpy(), + paddle.atanh(x).numpy()) d = paddle.to_tensor([[1.2285208, 1.3491015, 1.4899898], [1.30058, 1.0688717, 1.4928783], [1.0958099, 1.3724753, 1.8926544]]) d = d.matmul(d.t()) # ROCM not support cholesky if not fluid.core.is_compiled_with_rocm(): - self.assertTrue( - np.array_equal(d.cholesky().numpy(), - paddle.cholesky(d).numpy())) - - self.assertTrue( - np.array_equal(x.is_empty().numpy(), - paddle.is_empty(x).numpy())) - self.assertTrue( - np.array_equal(x.isfinite().numpy(), - paddle.isfinite(x).numpy())) - self.assertTrue( - np.array_equal( - x.cast('int32').numpy(), - paddle.cast(x, 'int32').numpy())) - self.assertTrue( - np.array_equal( - x.expand([3, 2, 3]).numpy(), - paddle.expand(x, [3, 2, 3]).numpy())) - self.assertTrue( - np.array_equal( - x.tile([2, 2]).numpy(), - paddle.tile(x, [2, 2]).numpy())) - self.assertTrue( - np.array_equal(x.flatten().numpy(), - paddle.flatten(x).numpy())) + np.testing.assert_array_equal(d.cholesky().numpy(), + paddle.cholesky(d).numpy()) + + np.testing.assert_array_equal(x.is_empty().numpy(), + paddle.is_empty(x).numpy()) + np.testing.assert_array_equal(x.isfinite().numpy(), + paddle.isfinite(x).numpy()) + np.testing.assert_array_equal( + x.cast('int32').numpy(), + paddle.cast(x, 'int32').numpy()) + np.testing.assert_array_equal( + x.expand([3, 2, 3]).numpy(), + paddle.expand(x, [3, 2, 3]).numpy()) + np.testing.assert_array_equal( + x.tile([2, 2]).numpy(), + paddle.tile(x, [2, 2]).numpy()) + np.testing.assert_array_equal(x.flatten().numpy(), + paddle.flatten(x).numpy()) index = paddle.to_tensor([0, 1]) - self.assertTrue( - np.array_equal( - x.gather(index).numpy(), - paddle.gather(x, index).numpy())) + np.testing.assert_array_equal( + x.gather(index).numpy(), + paddle.gather(x, index).numpy()) index = paddle.to_tensor([[0, 1], [1, 2]]) - self.assertTrue( - np.array_equal( - x.gather_nd(index).numpy(), - paddle.gather_nd(x, index).numpy())) - self.assertTrue( - np.array_equal( - x.reverse([0, 1]).numpy(), - paddle.reverse(x, [0, 1]).numpy())) - self.assertTrue( - np.array_equal( - a.reshape([3, 2]).numpy(), - paddle.reshape(a, [3, 2]).numpy())) - self.assertTrue( - np.array_equal( - x.slice([0, 1], [0, 0], [1, 2]).numpy(), - paddle.slice(x, [0, 1], [0, 0], [1, 2]).numpy())) - self.assertTrue( - np.array_equal( - 
x.split(2)[0].numpy(), - paddle.split(x, 2)[0].numpy())) + np.testing.assert_array_equal( + x.gather_nd(index).numpy(), + paddle.gather_nd(x, index).numpy()) + np.testing.assert_array_equal( + x.reverse([0, 1]).numpy(), + paddle.reverse(x, [0, 1]).numpy()) + np.testing.assert_array_equal( + a.reshape([3, 2]).numpy(), + paddle.reshape(a, [3, 2]).numpy()) + np.testing.assert_array_equal( + x.slice([0, 1], [0, 0], [1, 2]).numpy(), + paddle.slice(x, [0, 1], [0, 0], [1, 2]).numpy()) + np.testing.assert_array_equal( + x.split(2)[0].numpy(), + paddle.split(x, 2)[0].numpy()) m = paddle.to_tensor( np.random.uniform(-1, 1, [1, 6, 1, 1]).astype(self.dtype)) - self.assertTrue( - np.array_equal( - m.squeeze([]).numpy(), - paddle.squeeze(m, []).numpy())) - self.assertTrue( - np.array_equal( - m.squeeze([1, 2]).numpy(), - paddle.squeeze(m, [1, 2]).numpy())) + np.testing.assert_array_equal( + m.squeeze([]).numpy(), + paddle.squeeze(m, []).numpy()) + np.testing.assert_array_equal( + m.squeeze([1, 2]).numpy(), + paddle.squeeze(m, [1, 2]).numpy()) m = paddle.to_tensor([2, 3, 3, 1, 5, 3], 'float32') - self.assertTrue( - np.array_equal(m.unique()[0].numpy(), - paddle.unique(m)[0].numpy())) - self.assertTrue( - np.array_equal( - m.unique(return_counts=True)[1], - paddle.unique(m, return_counts=True)[1])) - self.assertTrue(np.array_equal(x.flip([0]), paddle.flip(x, [0]))) - self.assertTrue(np.array_equal(x.unbind(0), paddle.unbind(x, 0))) - self.assertTrue(np.array_equal(x.roll(1), paddle.roll(x, 1))) - self.assertTrue(np.array_equal(x.cumsum(1), paddle.cumsum(x, 1))) + np.testing.assert_array_equal(m.unique()[0].numpy(), + paddle.unique(m)[0].numpy()) + np.testing.assert_array_equal( + m.unique(return_counts=True)[1], + paddle.unique(m, return_counts=True)[1]) + np.testing.assert_array_equal(x.flip([0]), paddle.flip(x, [0])) + np.testing.assert_array_equal(x.unbind(0), paddle.unbind(x, 0)) + np.testing.assert_array_equal(x.roll(1), paddle.roll(x, 1)) + np.testing.assert_array_equal(x.cumsum(1), paddle.cumsum(x, 1)) m = paddle.to_tensor(1) - self.assertTrue(np.array_equal(m.increment(), paddle.increment(m))) + np.testing.assert_array_equal(m.increment(), paddle.increment(m)) m = x.abs() - self.assertTrue(np.array_equal(m.log(), paddle.log(m))) - self.assertTrue(np.array_equal(x.pow(2), paddle.pow(x, 2))) - self.assertTrue(np.array_equal(x.reciprocal(), paddle.reciprocal(x))) + np.testing.assert_array_equal(m.log(), paddle.log(m)) + np.testing.assert_array_equal(x.pow(2), paddle.pow(x, 2)) + np.testing.assert_array_equal(x.reciprocal(), paddle.reciprocal(x)) # 2. 
Binary operation - self.assertTrue( - np.array_equal(x.divide(y).numpy(), - paddle.divide(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.matmul(y, True, False).numpy(), - paddle.matmul(x, y, True, False).numpy())) - self.assertTrue( - np.array_equal( - x.norm(p='fro', axis=[0, 1]).numpy(), - paddle.norm(x, p='fro', axis=[0, 1]).numpy())) - self.assertTrue( - np.array_equal(x.dist(y).numpy(), - paddle.dist(x, y).numpy())) - self.assertTrue( - np.array_equal(x.cross(y).numpy(), - paddle.cross(x, y).numpy())) + np.testing.assert_array_equal( + x.divide(y).numpy(), + paddle.divide(x, y).numpy()) + np.testing.assert_array_equal( + x.matmul(y, True, False).numpy(), + paddle.matmul(x, y, True, False).numpy()) + np.testing.assert_array_equal( + x.norm(p='fro', axis=[0, 1]).numpy(), + paddle.norm(x, p='fro', axis=[0, 1]).numpy()) + np.testing.assert_array_equal( + x.dist(y).numpy(), + paddle.dist(x, y).numpy()) + np.testing.assert_array_equal( + x.cross(y).numpy(), + paddle.cross(x, y).numpy()) m = x.expand([2, 2, 3]) n = y.expand([2, 2, 3]).transpose([0, 2, 1]) - self.assertTrue( - np.array_equal(m.bmm(n).numpy(), - paddle.bmm(m, n).numpy())) - self.assertTrue( - np.array_equal( - x.histogram(5, -1, 1).numpy(), - paddle.histogram(x, 5, -1, 1).numpy())) - self.assertTrue( - np.array_equal(x.equal(y).numpy(), - paddle.equal(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.greater_equal(y).numpy(), - paddle.greater_equal(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.greater_than(y).numpy(), - paddle.greater_than(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.less_equal(y).numpy(), - paddle.less_equal(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.less_than(y).numpy(), - paddle.less_than(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.not_equal(y).numpy(), - paddle.not_equal(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.equal_all(y).numpy(), - paddle.equal_all(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.allclose(y).numpy(), - paddle.allclose(x, y).numpy())) + np.testing.assert_array_equal( + m.bmm(n).numpy(), + paddle.bmm(m, n).numpy()) + np.testing.assert_array_equal( + x.histogram(5, -1, 1).numpy(), + paddle.histogram(x, 5, -1, 1).numpy()) + np.testing.assert_array_equal( + x.equal(y).numpy(), + paddle.equal(x, y).numpy()) + np.testing.assert_array_equal( + x.greater_equal(y).numpy(), + paddle.greater_equal(x, y).numpy()) + np.testing.assert_array_equal( + x.greater_than(y).numpy(), + paddle.greater_than(x, y).numpy()) + np.testing.assert_array_equal( + x.less_equal(y).numpy(), + paddle.less_equal(x, y).numpy()) + np.testing.assert_array_equal( + x.less_than(y).numpy(), + paddle.less_than(x, y).numpy()) + np.testing.assert_array_equal( + x.not_equal(y).numpy(), + paddle.not_equal(x, y).numpy()) + np.testing.assert_array_equal( + x.equal_all(y).numpy(), + paddle.equal_all(x, y).numpy()) + np.testing.assert_array_equal( + x.allclose(y).numpy(), + paddle.allclose(x, y).numpy()) m = x.expand([2, 2, 3]) - self.assertTrue( - np.array_equal( - x.expand_as(m).numpy(), - paddle.expand_as(x, m).numpy())) + np.testing.assert_array_equal( + x.expand_as(m).numpy(), + paddle.expand_as(x, m).numpy()) index = paddle.to_tensor([2, 1, 0]) - self.assertTrue( - np.array_equal( - a.scatter(index, b).numpy(), - paddle.scatter(a, index, b).numpy())) + np.testing.assert_array_equal( + a.scatter(index, b).numpy(), + paddle.scatter(a, index, b).numpy()) # 3. 
Bool tensor operation x = paddle.to_tensor([[True, False], [True, False]]) y = paddle.to_tensor([[False, False], [False, True]]) - self.assertTrue( - np.array_equal( - x.logical_and(y).numpy(), - paddle.logical_and(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.logical_not(y).numpy(), - paddle.logical_not(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.logical_or(y).numpy(), - paddle.logical_or(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.logical_xor(y).numpy(), - paddle.logical_xor(x, y).numpy())) - self.assertTrue( - np.array_equal( - x.logical_and(y).numpy(), - paddle.logical_and(x, y).numpy())) + np.testing.assert_array_equal( + x.logical_and(y).numpy(), + paddle.logical_and(x, y).numpy()) + np.testing.assert_array_equal( + x.logical_not(y).numpy(), + paddle.logical_not(x, y).numpy()) + np.testing.assert_array_equal( + x.logical_or(y).numpy(), + paddle.logical_or(x, y).numpy()) + np.testing.assert_array_equal( + x.logical_xor(y).numpy(), + paddle.logical_xor(x, y).numpy()) + np.testing.assert_array_equal( + x.logical_and(y).numpy(), + paddle.logical_and(x, y).numpy()) a = paddle.to_tensor([[1, 2], [3, 4]]) b = paddle.to_tensor([[4, 3], [2, 1]]) - self.assertTrue( - np.array_equal( - x.where(a, b).numpy(), - paddle.where(x, a, b).numpy())) + np.testing.assert_array_equal( + x.where(a, b).numpy(), + paddle.where(x, a, b).numpy()) x_np = np.random.randn(3, 6, 9, 7) x = paddle.to_tensor(x_np) x_T = x.T self.assertTrue(x_T.shape, [7, 9, 6, 3]) - self.assertTrue(np.array_equal(x_T.numpy(), x_np.T)) + np.testing.assert_array_equal(x_T.numpy(), x_np.T) self.assertTrue(inspect.ismethod(a.dot)) self.assertTrue(inspect.ismethod(a.logsumexp)) @@ -781,7 +728,7 @@ def func_test_complex_scalar(self): with fluid.dygraph.guard(): a = fluid.dygraph.to_variable(a_np) res = 1J * a - self.assertTrue(np.array_equal(res.numpy(), 1J * a_np)) + np.testing.assert_array_equal(res.numpy(), 1j * a_np) def test_complex_scalar(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index e6481e12f1e1e..d069ebff1dc4b 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -523,7 +523,7 @@ def test_compute_type_fp32(self): result_np = np.matmul(input_x, input_y) self.assertTrue(paddle.isfinite(result)[0, 0, 0]) self.assertTrue(np.isfinite(result_np)[0, 0, 0]) - self.assertTrue(np.array_equal(result_np, result.numpy())) + np.testing.assert_array_equal(result_np, result.numpy()) paddle.set_flags( {'FLAGS_gemm_use_half_precision_compute_type': True}) diff --git a/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py b/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py index cadbca93ad3f2..608fad131f5b5 100644 --- a/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py +++ b/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py @@ -187,3 +187,7 @@ def init_case(self): self.dtype = 'float64' self.axis = None self.keepdim = False + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index 6b7a47febb835..a3608b5aa5e6f 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -100,7 +100,7 @@ def test_checkout_grad(self): dx = paddle.grad(y, x)[0].numpy() dx_expected = self.dtype(1.0 / 
np.prod(x_np.shape)) * np.ones( x_np.shape).astype(self.dtype) - self.assertTrue(np.array_equal(dx, dx_expected)) + np.testing.assert_array_equal(dx, dx_expected) @OpTestTool.skip_if_not_cpu_bf16() @@ -193,7 +193,7 @@ def test_check_grad(self): dx_expected = ref_reduce_mean_grad(self.inputs['X'], self.attrs['dim'], self.dtype) - self.assertTrue(np.array_equal(dx, dx_expected)) + np.testing.assert_array_equal(dx, dx_expected) class TestReduceMeanOpDefaultAttrs(TestReduceMeanOp): diff --git a/python/paddle/fluid/tests/unittests/test_memcpy_op.py b/python/paddle/fluid/tests/unittests/test_memcpy_op.py index a1469ca558be0..7a925b10036d2 100755 --- a/python/paddle/fluid/tests/unittests/test_memcpy_op.py +++ b/python/paddle/fluid/tests/unittests/test_memcpy_op.py @@ -139,7 +139,7 @@ def test_hip_copy_bool_value(self): feed={}, fetch_list=[gpu_var.name, pinned_var.name]) expect_value = np.array([1]).astype('bool') - self.assertTrue(np.array_equal(gpu_, expect_value)) + np.testing.assert_array_equal(gpu_, expect_value) else: pass @@ -201,7 +201,7 @@ def test_api(self): a = paddle.ones([1024, 1024]) b = paddle.tensor.creation._memcpy(a, paddle.CUDAPinnedPlace()) self.assertEqual(b.place.__repr__(), "Place(gpu_pinned)") - self.assertTrue(np.array_equal(a.numpy(), b.numpy())) + np.testing.assert_array_equal(a.numpy(), b.numpy()) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py b/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py index 98550ac5018ab..7b335bf83d6ce 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py +++ b/python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py @@ -50,7 +50,7 @@ def main_impl(self, place): for _ in range(self.iteration): exe.run(compiled_prog, feed=feed_dict, fetch_list=[loss.name]) - self.assertTrue(np.array_equal(np.array(image_tensor), np_image)) + np.testing.assert_array_equal(np.array(image_tensor), np_image) def test_main(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_merged_adam_op.py b/python/paddle/fluid/tests/unittests/test_merged_adam_op.py index 02cadf0230071..1038d0db4f60a 100644 --- a/python/paddle/fluid/tests/unittests/test_merged_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_merged_adam_op.py @@ -135,7 +135,7 @@ def run_op(use_merged): value2 = outs2[key] for i in range(len(value1)): if place == 'gpu': - self.assertTrue(np.array_equal(value1[i], value2[i])) + np.testing.assert_array_equal(value1[i], value2[i]) else: self.assertTrue(np.allclose(value1[i], value2[i], atol=1e-7)) diff --git a/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py b/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py index 4afdc267de5cb..957b9e45e0ca6 100644 --- a/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py @@ -314,7 +314,7 @@ def run_op(use_merged): self.assertEqual(len(outs1), len(outs2)) for i, (out1, out2) in enumerate(zip(outs1, outs2)): if isinstance(place, paddle.CUDAPlace): - self.assertTrue(np.array_equal(out1, out2)) + np.testing.assert_array_equal(out1, out2) else: self.assertTrue(np.allclose(out1, out2, atol=1e-7)) @@ -378,7 +378,7 @@ def run_op(use_nesterov, use_merged): self.assertEqual(len(outs1), len(outs2)) for i, (out1, out2) in enumerate(zip(outs1, outs2)): if isinstance(place, paddle.CUDAPlace): - self.assertTrue(np.array_equal(out1, out2)) + 
np.testing.assert_array_equal(out1, out2) else: self.assertTrue(np.allclose(out1, out2, atol=1e-7)) @@ -387,7 +387,7 @@ def run_op(use_nesterov, use_merged): self.assertEqual(len(outs3), len(outs4)) for j, (out3, out4) in enumerate(zip(outs3, outs4)): if isinstance(place, paddle.CUDAPlace): - self.assertTrue(np.array_equal(out3, out4)) + np.testing.assert_array_equal(out3, out4) else: self.assertTrue(np.allclose(out3, out4, atol=1e-7)) diff --git a/python/paddle/fluid/tests/unittests/test_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_mixed_precision.py index 68dfb88ccd05f..5fdc137f74470 100644 --- a/python/paddle/fluid/tests/unittests/test_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_mixed_precision.py @@ -117,9 +117,9 @@ def test_skip_update(self): found_inf) if i % 2: self.assertTrue(found_inf) - self.assertTrue(np.array_equal(weight_, pre_weight_)) - self.assertTrue(np.array_equal(moment1_, pre_moment1_)) - self.assertTrue(np.array_equal(beta_pow1_, pre_beta_pow1_)) + np.testing.assert_array_equal(weight_, pre_weight_) + np.testing.assert_array_equal(moment1_, pre_moment1_) + np.testing.assert_array_equal(beta_pow1_, pre_beta_pow1_) else: self.assertFalse(found_inf) self.assertFalse(np.array_equal(weight_, pre_weight_)) diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py index b32833916e2b1..d3dd0d277455e 100644 --- a/python/paddle/fluid/tests/unittests/test_mse_loss.py +++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py @@ -316,4 +316,5 @@ def test_NNFunctionalMseLoss_none(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py index b60a46d66adb9..2233189285a5b 100644 --- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py +++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py @@ -251,33 +251,33 @@ def test_fixed_random_number(self): self.assertEqual(np.sum(y), 5187793) self.assertEqual(np.mean(y), 5066.2041015625) expect = [9982, 1655, 4741, 1323, 9319, 3298, 6473, 7477, 2507, 2628] - self.assertTrue(np.array_equal(y[100:110, :].flatten(), expect)) + np.testing.assert_array_equal(y[100:110, :].flatten(), expect) y = paddle.multinomial(x, 5000, replacement=False).numpy() self.assertEqual(np.sum(y), 25603962316) self.assertEqual(np.mean(y), 5000.77388984375) expect = [7300, 6055, 8714, 5401, 7360, 161, 5035, 7002, 6788, 2916] - self.assertTrue(np.array_equal(y[100, 1000:1010], expect)) + np.testing.assert_array_equal(y[100, 1000:1010], expect) y = paddle.multinomial(x, 5000, replacement=False).numpy() self.assertEqual(np.sum(y), 25592855710) self.assertEqual(np.mean(y), 4998.604630859375) expect = [5700, 6567, 4399, 5688, 7472, 545, 6894, 526, 2124, 385] - self.assertTrue(np.array_equal(y[300, 3000:3010], expect)) + np.testing.assert_array_equal(y[300, 3000:3010], expect) y = paddle.multinomial(x, 20000, replacement=True).numpy() self.assertEqual(np.sum(y), 102371362581) self.assertEqual(np.mean(y), 4998.60168852539) self.assertEqual(np.std(y), 2886.316308500771) expect = [7630, 8235, 8445, 3275, 5580, 4591, 1331, 342, 1662, 7156] - self.assertTrue(np.array_equal(y[100, 0:10], expect)) + np.testing.assert_array_equal(y[100, 0:10], expect) y = paddle.multinomial(x, 20000, replacement=True).numpy() self.assertEqual(np.sum(y), 102400672117) self.assertEqual(np.mean(y), 5000.032818212891) 
self.assertEqual(np.std(y), 2886.913426124017) expect = [4159, 7849, 9305, 5759, 4422, 122, 345, 2897, 5200, 5911] - self.assertTrue(np.array_equal(y[100, 0:10], expect)) + np.testing.assert_array_equal(y[100, 0:10], expect) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_op_function_generator.py b/python/paddle/fluid/tests/unittests/test_op_function_generator.py index e97895cf8bbe1..82fe1e1781d7e 100644 --- a/python/paddle/fluid/tests/unittests/test_op_function_generator.py +++ b/python/paddle/fluid/tests/unittests/test_op_function_generator.py @@ -51,7 +51,7 @@ def test_elementwise_add(self): res1 = layers.elementwise_add(x, y) res2 = _C_ops.elementwise_add(x, y) - self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_elementwise_mul(self): with fluid.dygraph.guard(): @@ -63,7 +63,7 @@ def test_elementwise_mul(self): res1 = layers.elementwise_mul(x, y) res2 = _C_ops.elementwise_mul(x, y) - self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_relu(self): with fluid.dygraph.guard(): @@ -73,7 +73,7 @@ def test_relu(self): res1 = layers.relu(x) res2 = _C_ops.relu(x) - self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_trace_backward(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) @@ -91,8 +91,8 @@ def test_trace_backward(self): x_grad = x.gradient() y_grad = y.gradient() - self.assertTrue(np.array_equal(x_grad, loss.gradient() * b)) - self.assertTrue(np.array_equal(y_grad, loss.gradient() * a)) + np.testing.assert_array_equal(x_grad, loss.gradient() * b) + np.testing.assert_array_equal(y_grad, loss.gradient() * a) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_traced_layer(self): @@ -106,8 +106,7 @@ def test_traced_layer(self): layer, inputs=x) # dygraph out res_static_graph = static_layer([x])[0] - self.assertTrue( - np.array_equal(res_dygraph.numpy(), res_static_graph)) + np.testing.assert_array_equal(res_dygraph.numpy(), res_static_graph) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_ops_nms.py b/python/paddle/fluid/tests/unittests/test_ops_nms.py index 3d6f2b717f261..4004f29b446a3 100644 --- a/python/paddle/fluid/tests/unittests/test_ops_nms.py +++ b/python/paddle/fluid/tests/unittests/test_ops_nms.py @@ -100,9 +100,10 @@ def test_nms(self): self.threshold) out_py = nms(boxes, self.threshold) - self.assertTrue( - np.array_equal(out.numpy(), out_py), - "paddle out: {}\n py out: {}\n".format(out, out_py)) + np.testing.assert_array_equal( + out.numpy(), + out_py, + err_msg='paddle out: {}\n py out: {}\n'.format(out, out_py)) def test_multiclass_nms_dynamic(self): for device in self.devices: @@ -118,9 +119,10 @@ def test_multiclass_nms_dynamic(self): out_py = multiclass_nms(boxes, scores, category_idxs, self.threshold, self.topk) - self.assertTrue( - np.array_equal(out.numpy(), out_py), - "paddle out: {}\n py out: {}\n".format(out, out_py)) + np.testing.assert_array_equal( + out.numpy(), + out_py, + err_msg='paddle out: {}\n py out: {}\n'.format(out, out_py)) def test_multiclass_nms_static(self): for device in self.devices: @@ -157,9 +159,10 @@ def test_multiclass_nms_static(self): self.threshold, self.topk) out = np.array(out) out = np.squeeze(out) - self.assertTrue( - np.array_equal(out, out_py), - "paddle out: {}\n py out: {}\n".format(out, out_py)) + 
np.testing.assert_array_equal( + out, + out_py, + err_msg='paddle out: {}\n py out: {}\n'.format(out, out_py)) def test_multiclass_nms_dynamic_to_static(self): for device in self.devices: @@ -192,9 +195,10 @@ def fun(x): ) load_func = paddle.jit.load(self.path) res = load_func(paddle.to_tensor(boxes)) - self.assertTrue( - np.array_equal(origin, res), - "origin out: {}\n inference model out: {}\n".format( + np.testing.assert_array_equal( + origin, + res, + err_msg='origin out: {}\n inference model out: {}\n'.format( origin, res)) def test_matrix_nms_dynamic(self): diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 490167a8ff796..df470bc9ec1cb 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -1362,8 +1362,7 @@ def test_with_state_dict(self): use_save_load=True) out_no_state_dict = self.check_with_opt_state_dict( use_save_load=False) - self.assertTrue( - np.array_equal(out_use_state_dict, out_no_state_dict)) + np.testing.assert_array_equal(out_use_state_dict, out_no_state_dict) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py index 30044fec755a3..15b33bc97006b 100644 --- a/python/paddle/fluid/tests/unittests/test_pad_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py index 43d4276905390..0faeaec53d27e 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py @@ -194,15 +194,13 @@ def func_none_one_initial_gradient(self): if grad_y is not None: self.assertTrue(grad_y.stop_gradient) - self.assertTrue( - np.array_equal(grad_y.numpy(), - original_random_grad_y)) + np.testing.assert_array_equal(grad_y.numpy(), + original_random_grad_y) if grad_z is not None: self.assertTrue(grad_z.stop_gradient) - self.assertTrue( - np.array_equal(grad_z.numpy(), - original_random_grad_z)) + np.testing.assert_array_equal(grad_z.numpy(), + original_random_grad_z) def test_none_one_initial_gradient(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index d3052b719ae2d..44ec5a0aa6ddd 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -117,7 +117,7 @@ def test_large_parameters_paddle_save(self): dict_load = paddle.load(path, return_numpy=True) # compare results before and after saving for key, value in save_dict.items(): - self.assertTrue(np.array_equal(dict_load[key], value.numpy())) + np.testing.assert_array_equal(dict_load[key], value.numpy()) class TestSaveLoadPickle(unittest.TestCase): @@ -158,8 +158,8 @@ def test_pickle_protocol(self): dict_load = paddle.load(path) # compare results before and after saving for key, value in save_dict.items(): - self.assertTrue( - np.array_equal(dict_load[key].numpy(), value.numpy())) + np.testing.assert_array_equal(dict_load[key].numpy(), + 
value.numpy()) class TestSaveLoadAny(unittest.TestCase): @@ -242,7 +242,7 @@ def test_replace_static_save_load(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, np.array(base_t))) + np.testing.assert_array_equal(new_t, np.array(base_t)) # legacy paddle.fluid.save, paddle.load paddle.fluid.io.save(prog, path) self.set_zero(prog, place) @@ -252,7 +252,7 @@ def test_replace_static_save_load(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) # test for return tensor path_vars = 'test_replace_save_load_return_tensor_static/model' for var in prog.list_vars(): @@ -285,7 +285,7 @@ def test_replace_static_save_load(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def test_paddle_save_load_v2(self): paddle.disable_static() @@ -329,12 +329,12 @@ def get_lr(self): if isinstance(v, dict): self.assertTrue(v == load_dict_tensor[k]) else: - self.assertTrue( - np.array_equal(v.numpy(), load_dict_tensor[k].numpy())) + np.testing.assert_array_equal(v.numpy(), + load_dict_tensor[k].numpy()) if not np.array_equal(v.numpy(), load_dict_np[k]): print(v.numpy()) print(load_dict_np[k]) - self.assertTrue(np.array_equal(v.numpy(), load_dict_np[k])) + np.testing.assert_array_equal(v.numpy(), load_dict_np[k]) def test_single_pickle_var_dygraph(self): # enable dygraph mode @@ -354,14 +354,14 @@ def test_single_pickle_var_dygraph(self): isinstance( t_dygraph, (paddle.fluid.core.VarBase, paddle.fluid.core.eager.Tensor))) - self.assertTrue(np.array_equal(tensor.numpy(), np_dygraph)) - self.assertTrue(np.array_equal(tensor.numpy(), t_dygraph.numpy())) + np.testing.assert_array_equal(tensor.numpy(), np_dygraph) + np.testing.assert_array_equal(tensor.numpy(), t_dygraph.numpy()) paddle.enable_static() lod_static = paddle.load(path) np_static = paddle.load(path, return_numpy=True) self.assertTrue(isinstance(lod_static, paddle.fluid.core.LoDTensor)) - self.assertTrue(np.array_equal(tensor.numpy(), np_static)) - self.assertTrue(np.array_equal(tensor.numpy(), np.array(lod_static))) + np.testing.assert_array_equal(tensor.numpy(), np_static) + np.testing.assert_array_equal(tensor.numpy(), np.array(lod_static)) def test_single_pickle_var_static(self): # enable static mode @@ -394,17 +394,17 @@ def test_single_pickle_var_static(self): np_static = paddle.load(path, return_numpy=True) # set_tensor(np.ndarray) var.set_value(np_static, scope) - self.assertTrue(np.array_equal(origin_tensor, np.array(tensor))) + np.testing.assert_array_equal(origin_tensor, np.array(tensor)) # set_tensor(LoDTensor) self.set_zero(prog, place, scope) var.set_value(lod_static, scope) - self.assertTrue(np.array_equal(origin_tensor, np.array(tensor))) + np.testing.assert_array_equal(origin_tensor, np.array(tensor)) # enable dygraph mode paddle.disable_static() var_dygraph = paddle.load(path) np_dygraph = paddle.load(path, return_numpy=True) - self.assertTrue(np.array_equal(np.array(tensor), np_dygraph)) - self.assertTrue(np.array_equal(np.array(tensor), var_dygraph.numpy())) + np.testing.assert_array_equal(np.array(tensor), np_dygraph) + np.testing.assert_array_equal(np.array(tensor), var_dygraph.numpy()) def test_dygraph_save_static_load(self): inps = 
np.random.randn(1, IMAGE_SIZE).astype('float32') @@ -432,9 +432,8 @@ def test_dygraph_save_static_load(self): program.set_state_dict(state_dict) state_dict_param = program.state_dict("param") for name, tensor in state_dict_dy.items(): - self.assertTrue( - np.array_equal(tensor.numpy(), - np.array(state_dict_param[tensor.name]))) + np.testing.assert_array_equal( + tensor.numpy(), np.array(state_dict_param[tensor.name])) def test_save_load_complex_object_dygraph_save(self): paddle.disable_static() @@ -471,58 +470,55 @@ def test_save_load_complex_object_dygraph_save(self): load_tensor3 = paddle.load(path3, return_numpy=False) load_tensor4 = paddle.load(path4, return_numpy=False) - self.assertTrue(np.array_equal(load_tensor1[0].numpy(), - obj1[0].numpy())) - self.assertTrue(np.array_equal(load_tensor1[1], obj1[1])) - self.assertTrue(np.array_equal(load_tensor1[2].numpy(), obj1[2][1])) + np.testing.assert_array_equal(load_tensor1[0].numpy(), obj1[0].numpy()) + np.testing.assert_array_equal(load_tensor1[1], obj1[1]) + np.testing.assert_array_equal(load_tensor1[2].numpy(), obj1[2][1]) for i in range(len(load_tensor1)): self.assertTrue( type(load_tensor1[i]) == type(load_tensor2['k1'][i])) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(v.numpy(), load_tensor2['k2'][k].numpy())) + np.testing.assert_array_equal(v.numpy(), + load_tensor2['k2'][k].numpy()) self.assertTrue(load_tensor2['epoch'] == 123) - self.assertTrue(np.array_equal(load_tensor3[0].numpy(), - obj3[0].numpy())) - self.assertTrue(np.array_equal(np.array(load_tensor3[1]), obj3[1])) + np.testing.assert_array_equal(load_tensor3[0].numpy(), obj3[0].numpy()) + np.testing.assert_array_equal(np.array(load_tensor3[1]), obj3[1]) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(load_tensor3[2]["state_dict"][k].numpy(), - v.numpy())) + np.testing.assert_array_equal( + load_tensor3[2]['state_dict'][k].numpy(), v.numpy()) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(load_tensor3[2]["opt"][k].numpy(), v.numpy())) + np.testing.assert_array_equal(load_tensor3[2]['opt'][k].numpy(), + v.numpy()) - self.assertTrue(np.array_equal(load_tensor4[0].numpy(), obj4[0])) + np.testing.assert_array_equal(load_tensor4[0].numpy(), obj4[0]) load_array1 = paddle.load(path1, return_numpy=True) load_array2 = paddle.load(path2, return_numpy=True) load_array3 = paddle.load(path3, return_numpy=True) load_array4 = paddle.load(path4, return_numpy=True) - self.assertTrue(np.array_equal(load_array1[0], obj1[0].numpy())) - self.assertTrue(np.array_equal(load_array1[1], obj1[1])) - self.assertTrue(np.array_equal(load_array1[2], obj1[2][1])) + np.testing.assert_array_equal(load_array1[0], obj1[0].numpy()) + np.testing.assert_array_equal(load_array1[1], obj1[1]) + np.testing.assert_array_equal(load_array1[2], obj1[2][1]) for i in range(len(load_array1)): self.assertTrue(type(load_array1[i]) == type(load_array2['k1'][i])) for k, v in state_dict.items(): - self.assertTrue(np.array_equal(v.numpy(), load_array2['k2'][k])) + np.testing.assert_array_equal(v.numpy(), load_array2['k2'][k]) self.assertTrue(load_array2['epoch'] == 123) - self.assertTrue(np.array_equal(load_array3[0], obj3[0].numpy())) - self.assertTrue(np.array_equal(load_array3[1], obj3[1])) + np.testing.assert_array_equal(load_array3[0], obj3[0].numpy()) + np.testing.assert_array_equal(load_array3[1], obj3[1]) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(load_array3[2]["state_dict"][k], v.numpy())) + 
np.testing.assert_array_equal(load_array3[2]['state_dict'][k], + v.numpy()) for k, v in state_dict.items(): - self.assertTrue(np.array_equal(load_array3[2]["opt"][k], v.numpy())) + np.testing.assert_array_equal(load_array3[2]['opt'][k], v.numpy()) - self.assertTrue(np.array_equal(load_array4[0], obj4[0])) + np.testing.assert_array_equal(load_array4[0], obj4[0]) # static mode paddle.enable_static() @@ -532,69 +528,68 @@ def test_save_load_complex_object_dygraph_save(self): load_tensor3 = paddle.load(path3, return_numpy=False) load_tensor4 = paddle.load(path4, return_numpy=False) - self.assertTrue( - np.array_equal(np.array(load_tensor1[0]), obj1[0].numpy())) - self.assertTrue(np.array_equal(np.array(load_tensor1[1]), obj1[1])) - self.assertTrue(np.array_equal(np.array(load_tensor1[2]), obj1[2][1])) + np.testing.assert_array_equal(np.array(load_tensor1[0]), + obj1[0].numpy()) + np.testing.assert_array_equal(np.array(load_tensor1[1]), obj1[1]) + np.testing.assert_array_equal(np.array(load_tensor1[2]), obj1[2][1]) for i in range(len(load_tensor1)): self.assertTrue( type(load_tensor1[i]) == type(load_tensor2['k1'][i])) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(v.numpy(), np.array(load_tensor2['k2'][k]))) + np.testing.assert_array_equal(v.numpy(), + np.array(load_tensor2['k2'][k])) self.assertTrue(load_tensor2['epoch'] == 123) self.assertTrue(isinstance(load_tensor3[0], paddle.fluid.core.LoDTensor)) - self.assertTrue( - np.array_equal(np.array(load_tensor3[0]), obj3[0].numpy())) - self.assertTrue(np.array_equal(np.array(load_tensor3[1]), obj3[1])) + np.testing.assert_array_equal(np.array(load_tensor3[0]), + obj3[0].numpy()) + np.testing.assert_array_equal(np.array(load_tensor3[1]), obj3[1]) for k, v in state_dict.items(): self.assertTrue( isinstance(load_tensor3[2]["state_dict"][k], paddle.fluid.core.LoDTensor)) - self.assertTrue( - np.array_equal(np.array(load_tensor3[2]["state_dict"][k]), - v.numpy())) + np.testing.assert_array_equal( + np.array(load_tensor3[2]['state_dict'][k]), v.numpy()) for k, v in state_dict.items(): self.assertTrue( isinstance(load_tensor3[2]["opt"][k], paddle.fluid.core.LoDTensor)) - self.assertTrue( - np.array_equal(np.array(load_tensor3[2]["opt"][k]), v.numpy())) + np.testing.assert_array_equal(np.array(load_tensor3[2]['opt'][k]), + v.numpy()) self.assertTrue(load_tensor4[0], paddle.fluid.core.LoDTensor) - self.assertTrue(np.array_equal(np.array(load_tensor4[0]), obj4[0])) + np.testing.assert_array_equal(np.array(load_tensor4[0]), obj4[0]) load_array1 = paddle.load(path1, return_numpy=True) load_array2 = paddle.load(path2, return_numpy=True) load_array3 = paddle.load(path3, return_numpy=True) load_array4 = paddle.load(path4, return_numpy=True) - self.assertTrue(np.array_equal(load_array1[0], obj1[0].numpy())) - self.assertTrue(np.array_equal(load_array1[1], obj1[1])) - self.assertTrue(np.array_equal(load_array1[2], obj1[2][1])) + np.testing.assert_array_equal(load_array1[0], obj1[0].numpy()) + np.testing.assert_array_equal(load_array1[1], obj1[1]) + np.testing.assert_array_equal(load_array1[2], obj1[2][1]) for i in range(len(load_array1)): self.assertTrue(type(load_array1[i]) == type(load_array2['k1'][i])) for k, v in state_dict.items(): - self.assertTrue(np.array_equal(v.numpy(), load_array2['k2'][k])) + np.testing.assert_array_equal(v.numpy(), load_array2['k2'][k]) self.assertTrue(load_array2['epoch'] == 123) self.assertTrue(isinstance(load_array3[0], np.ndarray)) - self.assertTrue(np.array_equal(load_array3[0], obj3[0].numpy())) - 
self.assertTrue(np.array_equal(load_array3[1], obj3[1])) + np.testing.assert_array_equal(load_array3[0], obj3[0].numpy()) + np.testing.assert_array_equal(load_array3[1], obj3[1]) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(load_array3[2]["state_dict"][k], v.numpy())) + np.testing.assert_array_equal(load_array3[2]['state_dict'][k], + v.numpy()) for k, v in state_dict.items(): - self.assertTrue(np.array_equal(load_array3[2]["opt"][k], v.numpy())) + np.testing.assert_array_equal(load_array3[2]['opt'][k], v.numpy()) - self.assertTrue(np.array_equal(load_array4[0], obj4[0])) + np.testing.assert_array_equal(load_array4[0], obj4[0]) def test_save_load_complex_object_static_save(self): paddle.enable_static() @@ -649,72 +644,66 @@ def test_save_load_complex_object_static_save(self): load_tensor3 = paddle.load(path3, return_numpy=False) load_tensor4 = paddle.load(path4, return_numpy=False) - self.assertTrue( - np.array_equal(np.array(load_tensor1[0]), np.array(obj1[0]))) - self.assertTrue(np.array_equal(np.array(load_tensor1[1]), obj1[1])) - self.assertTrue( - np.array_equal(np.array(load_tensor1[2]), obj1[2][1])) + np.testing.assert_array_equal(np.array(load_tensor1[0]), + np.array(obj1[0])) + np.testing.assert_array_equal(np.array(load_tensor1[1]), obj1[1]) + np.testing.assert_array_equal(np.array(load_tensor1[2]), obj1[2][1]) for i in range(len(load_tensor1)): self.assertTrue( type(load_tensor1[i]) == type(load_tensor2['k1'][i])) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(np.array(v), - np.array(load_tensor2['k2'][k]))) + np.testing.assert_array_equal(np.array(v), + np.array(load_tensor2['k2'][k])) self.assertTrue(load_tensor2['epoch'] == 123) self.assertTrue(isinstance(load_tensor3[0], fluid.core.LoDTensor)) - self.assertTrue(np.array_equal(np.array(load_tensor3[0]), obj3[0])) + np.testing.assert_array_equal(np.array(load_tensor3[0]), obj3[0]) self.assertTrue(isinstance(load_tensor3[1], fluid.core.LoDTensor)) - self.assertTrue(np.array_equal(np.array(load_tensor3[1]), obj3[1])) + np.testing.assert_array_equal(np.array(load_tensor3[1]), obj3[1]) for k, v in state_dict.items(): self.assertTrue( isinstance(load_tensor3[2]["state_dict"][k], fluid.core.LoDTensor)) - self.assertTrue( - np.array_equal(np.array(load_tensor3[2]["state_dict"][k]), - np.array(v))) + np.testing.assert_array_equal( + np.array(load_tensor3[2]['state_dict'][k]), np.array(v)) for k, v in state_dict.items(): self.assertTrue( isinstance(load_tensor3[2]["opt"][k], fluid.core.LoDTensor)) - self.assertTrue( - np.array_equal(np.array(load_tensor3[2]["opt"][k]), - np.array(v))) + np.testing.assert_array_equal( + np.array(load_tensor3[2]['opt'][k]), np.array(v)) self.assertTrue(isinstance(load_tensor4[0], fluid.core.LoDTensor)) - self.assertTrue(np.array_equal(np.array(load_tensor4[0]), obj4[0])) + np.testing.assert_array_equal(np.array(load_tensor4[0]), obj4[0]) load_array1 = paddle.load(path1, return_numpy=True) load_array2 = paddle.load(path2, return_numpy=True) load_array3 = paddle.load(path3, return_numpy=True) load_array4 = paddle.load(path4, return_numpy=True) - self.assertTrue(np.array_equal(load_array1[0], np.array(obj1[0]))) - self.assertTrue(np.array_equal(load_array1[1], obj1[1])) - self.assertTrue(np.array_equal(load_array1[2], obj1[2][1])) + np.testing.assert_array_equal(load_array1[0], np.array(obj1[0])) + np.testing.assert_array_equal(load_array1[1], obj1[1]) + np.testing.assert_array_equal(load_array1[2], obj1[2][1]) for i in range(len(load_array1)): self.assertTrue( 
type(load_array1[i]) == type(load_array2['k1'][i])) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(np.array(v), load_array2['k2'][k])) + np.testing.assert_array_equal(np.array(v), load_array2['k2'][k]) self.assertTrue(load_array2['epoch'] == 123) - self.assertTrue(np.array_equal(load_array3[0], np.array(obj3[0]))) - self.assertTrue(np.array_equal(load_array3[1], obj3[1])) + np.testing.assert_array_equal(load_array3[0], np.array(obj3[0])) + np.testing.assert_array_equal(load_array3[1], obj3[1]) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(load_array3[2]["state_dict"][k], - np.array(v))) + np.testing.assert_array_equal(load_array3[2]['state_dict'][k], + np.array(v)) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(load_array3[2]["opt"][k], np.array(v))) + np.testing.assert_array_equal(load_array3[2]['opt'][k], + np.array(v)) - self.assertTrue(np.array_equal(load_array4[0], obj4[0])) + np.testing.assert_array_equal(load_array4[0], obj4[0]) # dygraph mode paddle.disable_static() @@ -724,79 +713,74 @@ def test_save_load_complex_object_static_save(self): load_tensor3 = paddle.load(path3, return_numpy=False) load_tensor4 = paddle.load(path4, return_numpy=False) - self.assertTrue( - np.array_equal(np.array(load_tensor1[0]), np.array(obj1[0]))) - self.assertTrue(np.array_equal(np.array(load_tensor1[1]), obj1[1])) - self.assertTrue(np.array_equal(load_tensor1[2].numpy(), obj1[2][1])) + np.testing.assert_array_equal(np.array(load_tensor1[0]), + np.array(obj1[0])) + np.testing.assert_array_equal(np.array(load_tensor1[1]), obj1[1]) + np.testing.assert_array_equal(load_tensor1[2].numpy(), obj1[2][1]) for i in range(len(load_tensor1)): self.assertTrue( type(load_tensor1[i]) == type(load_tensor2['k1'][i])) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(np.array(v), - np.array(load_tensor2['k2'][k]))) + np.testing.assert_array_equal(np.array(v), + np.array(load_tensor2['k2'][k])) self.assertTrue(load_tensor2['epoch'] == 123) self.assertTrue( isinstance(load_tensor3[0], (fluid.core.VarBase, fluid.core.eager.Tensor))) - self.assertTrue(np.array_equal(load_tensor3[0].numpy(), obj3[0])) + np.testing.assert_array_equal(load_tensor3[0].numpy(), obj3[0]) self.assertTrue( isinstance(load_tensor3[1], (fluid.core.VarBase, fluid.core.eager.Tensor))) - self.assertTrue(np.array_equal(load_tensor3[1].numpy(), obj3[1])) + np.testing.assert_array_equal(load_tensor3[1].numpy(), obj3[1]) for k, v in state_dict.items(): self.assertTrue( isinstance(load_tensor3[2]["state_dict"][k], (fluid.core.VarBase, fluid.core.eager.Tensor))) - self.assertTrue( - np.array_equal(load_tensor3[2]["state_dict"][k].numpy(), - np.array(v))) + np.testing.assert_array_equal( + load_tensor3[2]['state_dict'][k].numpy(), np.array(v)) for k, v in state_dict.items(): self.assertTrue( isinstance(load_tensor3[2]["opt"][k], (fluid.core.VarBase, fluid.core.eager.Tensor))) - self.assertTrue( - np.array_equal(load_tensor3[2]["opt"][k].numpy(), - np.array(v))) + np.testing.assert_array_equal(load_tensor3[2]['opt'][k].numpy(), + np.array(v)) self.assertTrue( isinstance(load_tensor4[0], (fluid.core.VarBase, fluid.core.eager.Tensor))) - self.assertTrue(np.array_equal(load_tensor4[0].numpy(), obj4[0])) + np.testing.assert_array_equal(load_tensor4[0].numpy(), obj4[0]) load_array1 = paddle.load(path1, return_numpy=True) load_array2 = paddle.load(path2, return_numpy=True) load_array3 = paddle.load(path3, return_numpy=True) load_array4 = paddle.load(path4, return_numpy=True) - 
self.assertTrue(np.array_equal(load_array1[0], np.array(obj1[0]))) - self.assertTrue(np.array_equal(load_array1[1], obj1[1])) - self.assertTrue(np.array_equal(load_array1[2], obj1[2][1])) + np.testing.assert_array_equal(load_array1[0], np.array(obj1[0])) + np.testing.assert_array_equal(load_array1[1], obj1[1]) + np.testing.assert_array_equal(load_array1[2], obj1[2][1]) for i in range(len(load_array1)): self.assertTrue( type(load_array1[i]) == type(load_array2['k1'][i])) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(np.array(v), load_array2['k2'][k])) + np.testing.assert_array_equal(np.array(v), load_array2['k2'][k]) self.assertTrue(load_array2['epoch'] == 123) - self.assertTrue(np.array_equal(load_array3[0], np.array(obj3[0]))) - self.assertTrue(np.array_equal(load_array3[1], obj3[1])) + np.testing.assert_array_equal(load_array3[0], np.array(obj3[0])) + np.testing.assert_array_equal(load_array3[1], obj3[1]) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(load_array3[2]["state_dict"][k], - np.array(v))) + np.testing.assert_array_equal(load_array3[2]['state_dict'][k], + np.array(v)) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(load_array3[2]["opt"][k], np.array(v))) + np.testing.assert_array_equal(load_array3[2]['opt'][k], + np.array(v)) self.assertTrue(isinstance(load_array4[0], np.ndarray)) - self.assertTrue(np.array_equal(load_array4[0], obj4[0])) + np.testing.assert_array_equal(load_array4[0], obj4[0]) def test_varbase_binary_var(self): paddle.disable_static() @@ -810,8 +794,8 @@ def test_varbase_binary_var(self): load_tensor_array = load_tensor.numpy() if paddle.fluid.core.is_compiled_with_cuda(): fluid.core._cuda_synchronize(paddle.CUDAPlace(0)) - self.assertTrue(np.array_equal(origin_array, load_array)) - self.assertTrue(np.array_equal(origin_array, load_tensor_array)) + np.testing.assert_array_equal(origin_array, load_array) + np.testing.assert_array_equal(origin_array, load_tensor_array) class TestSaveLoadToMemory(unittest.TestCase): @@ -828,10 +812,10 @@ def test_dygraph_save_to_memory(self): # load state_dict dict_load = paddle.load(byio, return_numpy=True) for k, v in state_dict.items(): - self.assertTrue(np.array_equal(v.numpy(), dict_load[k])) + np.testing.assert_array_equal(v.numpy(), dict_load[k]) # load tensor tensor_load = paddle.load(byio, return_numpy=True) - self.assertTrue(np.array_equal(tensor_load, tensor.numpy())) + np.testing.assert_array_equal(tensor_load, tensor.numpy()) with self.assertRaises(ValueError): paddle.save(4, 3) @@ -874,11 +858,11 @@ def test_static_save_to_memory(self): prog_load.desc.serialize_to_string()) tensor_load = paddle.load(byio, return_numpy=True) - self.assertTrue(np.array_equal(tensor_load, np.array(tensor))) + np.testing.assert_array_equal(tensor_load, np.array(tensor)) state_dict_load = paddle.load(byio, return_numpy=True) for k, v in state_dict.items(): - self.assertTrue(np.array_equal(np.array(v), state_dict_load[k])) + np.testing.assert_array_equal(np.array(v), state_dict_load[k]) class TestSaveLoad(unittest.TestCase): @@ -915,7 +899,7 @@ def check_load_state_dict(self, orig_dict, load_dict): for var_name, value in orig_dict.items(): load_value = load_dict[var_name].numpy() if hasattr( load_dict[var_name], 'numpy') else np.array(load_dict[var_name]) - self.assertTrue(np.array_equal(value.numpy(), load_value)) + np.testing.assert_array_equal(value.numpy(), load_value) def test_save_load(self): layer, opt = self.build_and_train_model() diff --git 
a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py index c7ac11546e12e..4357b9925d3ee 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py @@ -120,7 +120,7 @@ def test_replace_save_load_vars(self): var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) # test for io.save_vars/replace_load_vars path_vars2 = os.path.join( self.temp_dir.name, @@ -137,7 +137,7 @@ def test_replace_save_load_vars(self): var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) def test_save_load_lod_tensor(self): paddle.enable_static() @@ -177,7 +177,7 @@ def test_save_load_lod_tensor(self): self.assertTrue( list(loaded_tensor.shape()) == [IMAGE_SIZE, OUTPUT_NUM]) to_array = np.array(loaded_tensor) - self.assertTrue(np.array_equal(origin, to_array)) + np.testing.assert_array_equal(origin, to_array) with self.assertRaises(NotImplementedError): path = os.path.join(self.temp_dir.name, 'test_save_load_error/temp') @@ -216,7 +216,7 @@ def test_save_load_lod_tensor(self): # load from memory loaded_tensor_mem = paddle.load(byio) to_array_mem = np.array(loaded_tensor_mem) - self.assertTrue(np.array_equal(np.array(tensor), to_array_mem)) + np.testing.assert_array_equal(np.array(tensor), to_array_mem) with self.assertRaises(NotImplementedError): paddle.framework.io._save_lod_tensor(tensor, 1) @@ -247,8 +247,7 @@ def test_save_load_selected_rows(self): self.assertTrue(isinstance(load_sr, fluid.core.SelectedRows)) self.assertTrue(list(load_sr.rows()) == rows) self.assertTrue(load_sr.height() == height) - self.assertTrue(np.array_equal(np.array(load_sr.get_tensor()), - np_array)) + np.testing.assert_array_equal(np.array(load_sr.get_tensor()), np_array) with self.assertRaises(RuntimeError): fluid.core.save_selected_rows( @@ -273,8 +272,8 @@ def test_save_load_selected_rows(self): self.assertTrue(isinstance(selected_rows_mem, fluid.core.SelectedRows)) self.assertTrue(list(selected_rows_mem.rows()) == rows) self.assertTrue(selected_rows_mem.height() == height) - self.assertTrue( - np.array_equal(np.array(selected_rows_mem.get_tensor()), np_array)) + np.testing.assert_array_equal(np.array(selected_rows_mem.get_tensor()), + np_array) with self.assertRaises(NotImplementedError): paddle.framework.io._save_selected_rows(selected_rows, 1) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py index 1a015369ec679..0a1b2a5d4591c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py @@ -103,7 +103,7 @@ def run_impl(self, use_gpu, dev_cnt, is_training, use_experimental_executor, }, fetch_list=[loss, isolated_var]) - self.assertTrue(np.array_equal(y_np, y_np_fetch)) + np.testing.assert_array_equal(y_np, y_np_fetch) enable_parallel_ssa_executor(False) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_inference_feed_partial_data.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_inference_feed_partial_data.py index a3a26f481f3f1..5500f6718041a 100644 --- 
a/python/paddle/fluid/tests/unittests/test_parallel_executor_inference_feed_partial_data.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_inference_feed_partial_data.py @@ -49,12 +49,12 @@ def run_network(self, places, use_split, has_persistable): gen_random = lambda shape: np.random.uniform( low=-1.0, high=1.0, size=shape).astype('float32') - assert_result = lambda feed, result: self.assertTrue( - np.array_equal(np.maximum(0, feed), result)) + assert_result = lambda feed, result: np.testing.assert_array_equal( + np.maximum(0, feed), result) def assert_merged_unmerged(merged, unmerged): unmerged = np.concatenate(unmerged, axis=0) - self.assertTrue(np.array_equal(merged, unmerged)) + np.testing.assert_array_equal(merged, unmerged) def feed_split_test(): for place_num in six.moves.range(1, len(places) * 3): diff --git a/python/paddle/fluid/tests/unittests/test_parameter.py b/python/paddle/fluid/tests/unittests/test_parameter.py index d75a6c0dd90f4..0115f7fec519c 100644 --- a/python/paddle/fluid/tests/unittests/test_parameter.py +++ b/python/paddle/fluid/tests/unittests/test_parameter.py @@ -46,9 +46,9 @@ def test_parameter(self): self.assertEqual(0, param.block.idx) exe = Executor(paddle.CPUPlace()) p = exe.run(main_program, fetch_list=[param])[0] - self.assertTrue(np.array_equal(p, np.ones(shape) * val)) + np.testing.assert_array_equal(p, np.ones(shape) * val) p = io.get_parameter_value_by_name('fc.w', exe, main_program) - self.assertTrue(np.array_equal(p, np.ones(shape) * val)) + np.testing.assert_array_equal(p, np.ones(shape) * val) def func_parambase(self): with guard(): @@ -61,7 +61,7 @@ def func_parambase(self): self.assertEqual(param_copy.type, param.type) self.assertEqual(param_copy.dtype, param.dtype) self.assertEqual(str(param_copy.place), str(param.place)) - self.assertTrue(np.array_equal(param_copy.numpy(), param.numpy())) + np.testing.assert_array_equal(param_copy.numpy(), param.numpy()) self.assertEqual(param_copy.optimize_attr, param.optimize_attr) self.assertEqual(param_copy.regularizer, param.regularizer) self.assertEqual(param_copy.do_model_average, @@ -116,12 +116,10 @@ def func_parambase_to_vector(self): paddle.nn.utils.vector_to_parameters(vec, linear2.parameters()) self.assertEqual(linear2.weight.shape, [10, 15]) self.assertEqual(linear2.bias.shape, [15]) - self.assertTrue( - np.array_equal(linear1.weight.numpy(), linear2.weight.numpy()), - True) - self.assertTrue( - np.array_equal(linear1.bias.numpy(), linear2.bias.numpy()), - True) + np.testing.assert_array_equal(linear1.weight.numpy(), + linear2.weight.numpy()) + np.testing.assert_array_equal(linear1.bias.numpy(), + linear2.bias.numpy()) self.assertTrue(linear2.weight.is_leaf, True) self.assertTrue(linear2.bias.is_leaf, True) diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py index 51f19747f2b66..764ba03d40161 100644 --- a/python/paddle/fluid/tests/unittests/test_poisson_op.py +++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py @@ -108,7 +108,7 @@ def test_dygraph(self): y = paddle.poisson(x) y.backward() self.assertTrue(np.min(y.numpy()) >= 0) - self.assertTrue(np.array_equal(np.zeros_like(x), x.gradient())) + np.testing.assert_array_equal(np.zeros_like(x), x.gradient()) def test_fixed_random_number(self): # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' @@ -127,31 +127,31 @@ def test_fixed_random_number(self): 13., 13., 11., 8., 12., 6., 9., 15., 16., 6., 13., 12., 9., 15., 17., 
8., 11., 16., 11., 10. ] - self.assertTrue(np.array_equal(y_np[0, 0, 0, 0:20], expect)) + np.testing.assert_array_equal(y_np[0, 0, 0, 0:20], expect) expect = [ 15., 7., 12., 8., 14., 10., 10., 11., 11., 11., 21., 6., 9., 13., 13., 11., 6., 9., 12., 12. ] - self.assertTrue(np.array_equal(y_np[8, 1, 300, 200:220], expect)) + np.testing.assert_array_equal(y_np[8, 1, 300, 200:220], expect) expect = [ 10., 15., 9., 6., 4., 13., 10., 10., 13., 12., 9., 7., 10., 14., 7., 10., 8., 5., 10., 14. ] - self.assertTrue(np.array_equal(y_np[16, 1, 600, 400:420], expect)) + np.testing.assert_array_equal(y_np[16, 1, 600, 400:420], expect) expect = [ 10., 9., 14., 12., 8., 9., 7., 8., 11., 10., 13., 8., 12., 9., 7., 8., 11., 11., 12., 5. ] - self.assertTrue(np.array_equal(y_np[24, 2, 900, 600:620], expect)) + np.testing.assert_array_equal(y_np[24, 2, 900, 600:620], expect) expect = [ 15., 5., 11., 13., 12., 12., 13., 16., 9., 9., 7., 9., 13., 11., 15., 6., 11., 9., 10., 10. ] - self.assertTrue(np.array_equal(y_np[31, 2, 1023, 748:768], expect)) + np.testing.assert_array_equal(y_np[31, 2, 1023, 748:768], expect) x = paddle.full([16, 1024, 1024], 5., dtype="float32") y = paddle.poisson(x) @@ -160,31 +160,31 @@ def test_fixed_random_number(self): 4., 5., 2., 9., 8., 7., 4., 7., 4., 7., 6., 3., 10., 7., 5., 7., 2., 5., 5., 6. ] - self.assertTrue(np.array_equal(y_np[0, 0, 100:120], expect)) + np.testing.assert_array_equal(y_np[0, 0, 100:120], expect) expect = [ 1., 4., 8., 11., 6., 5., 4., 4., 7., 4., 4., 7., 11., 6., 5., 3., 4., 6., 3., 3. ] - self.assertTrue(np.array_equal(y_np[4, 300, 300:320], expect)) + np.testing.assert_array_equal(y_np[4, 300, 300:320], expect) expect = [ 7., 5., 4., 6., 8., 5., 6., 7., 7., 7., 3., 10., 5., 10., 4., 5., 8., 7., 5., 7. ] - self.assertTrue(np.array_equal(y_np[8, 600, 600:620], expect)) + np.testing.assert_array_equal(y_np[8, 600, 600:620], expect) expect = [ 8., 6., 7., 4., 3., 0., 4., 6., 6., 4., 3., 10., 5., 1., 3., 8., 8., 2., 1., 4. ] - self.assertTrue(np.array_equal(y_np[12, 900, 900:920], expect)) + np.testing.assert_array_equal(y_np[12, 900, 900:920], expect) expect = [ 2., 1., 14., 3., 6., 5., 2., 2., 6., 5., 7., 4., 8., 4., 8., 4., 5., 7., 1., 7. 
] - self.assertTrue(np.array_equal(y_np[15, 1023, 1000:1020], expect)) + np.testing.assert_array_equal(y_np[15, 1023, 1000:1020], expect) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py index cd00af1ed96da..cbfecf0816909 100644 --- a/python/paddle/fluid/tests/unittests/test_program.py +++ b/python/paddle/fluid/tests/unittests/test_program.py @@ -202,5 +202,44 @@ def net(): self.assertFalse(var.has_stop_gradient()) +def build_program(): + main_program = paddle.static.Program() + startuo_program = paddle.static.Program() + with paddle.utils.unique_name.guard(): + with paddle.static.program_guard(main_program, startuo_program): + x = paddle.static.data(name='x', shape=[3, 2, 1]) + out = paddle.static.nn.fc(x=x, size=1, num_flatten_dims=2) + return main_program + + +class TestProgramProto(unittest.TestCase): + + def test_update_op(self): + program = build_program() + a = program.desc.serialize_to_string() + program.current_block().ops[0]._set_attr('use_mkldnn', True) + self.assertTrue(program.desc.need_update()) + b = program.desc.serialize_to_string() + self.assertFalse(a == b) + + def test_update_var(self): + program = build_program() + a = program.desc.serialize_to_string() + program.current_block().var("x").desc.set_stop_gradient(False) + self.assertTrue(program.desc.need_update()) + b = program.desc.serialize_to_string() + self.assertFalse(a == b) + + # it seems the attrs of framework::VarDesc is not write to proto, + # except for persistable/need_check_feed/is_parameter/stop_gradient + def test_update_var_attr(self): + program = build_program() + a = program.desc.serialize_to_string() + program.current_block().var("x").desc._set_attr("a", 1) + self.assertFalse(program.desc.need_update()) + b = program.desc.serialize_to_string() + self.assertTrue(a == b) # not affected + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_prune.py b/python/paddle/fluid/tests/unittests/test_prune.py index 730a6c1b8a8ff..180988a2aa589 100644 --- a/python/paddle/fluid/tests/unittests/test_prune.py +++ b/python/paddle/fluid/tests/unittests/test_prune.py @@ -219,8 +219,8 @@ def test_prune_fetches_without_optimizer(self): self.assertIsNone(scope.find_var(loss2.name)) #loss2 is pruned weight = np.array( scope.find_var(w_param_attrs.name).get_tensor()) - self.assertTrue(np.array_equal(weight_init, - weight)) # weight not changed + np.testing.assert_array_equal(weight_init, + weight) # weight not changed def test_prune_fetches_with_optimizer(self): """ @@ -311,8 +311,8 @@ def test_prune_feed_without_optimizer(self): self.assertIsNone(scope.find_var(loss2.name)) weight = np.array( scope.find_var(w_param_attrs.name).get_tensor()) - self.assertTrue(np.array_equal(weight_init, - weight)) # weight unchanged + np.testing.assert_array_equal(weight_init, + weight) # weight unchanged def test_prune_feed_with_optimizer(self): program = framework.Program() @@ -527,7 +527,7 @@ def test_prune_with_multi_optimizers(self): weight_expected = np.array( scope.find_var(w_param_attrs.name).get_tensor()) - self.assertTrue(np.array_equal(weight_with_prune, weight_expected)) + np.testing.assert_array_equal(weight_with_prune, weight_expected) self.assertFalse(np.array_equal(weight_without_prune, weight_expected)) def test_prune_with_multi_devices(self): @@ -661,7 +661,7 @@ def test_prune_program_with_tupe_in_fetch_list(self): weight_expected = np.array( 
scope.find_var(w_param_attrs.name).get_tensor()) - self.assertTrue(np.array_equal(weight_with_prune, weight_expected)) + np.testing.assert_array_equal(weight_with_prune, weight_expected) self.assertFalse(np.array_equal(weight_without_prune, weight_expected)) def test_prune_program_partial_parameter_updated(self): @@ -708,8 +708,8 @@ def test_prune_program_partial_parameter_updated(self): scope.find_var(w2_param_attrs.name).get_tensor()) self.assertFalse(np.array_equal(weight1_init, weight1)) # weight changed - self.assertTrue(np.array_equal(weight2_init, - weight2)) # weight2 unchanged + np.testing.assert_array_equal(weight2_init, + weight2) # weight2 unchanged def test_prune_override_use_prune(self): ''' @@ -768,7 +768,7 @@ def test_prune_override_use_prune(self): weight_expected = np.array( scope.find_var(w_param_attrs.name).get_tensor()) - self.assertTrue(np.array_equal(weight_with_prune, weight_expected)) + np.testing.assert_array_equal(weight_with_prune, weight_expected) self.assertFalse(np.array_equal(weight_without_prune, weight_expected)) def test_prune_feed_var_in_fetchlist_1(self): @@ -797,8 +797,8 @@ def test_prune_feed_var_in_fetchlist_1(self): self.assertIsNone(scope.find_var(x.name)) weight = np.array( scope.find_var(w_param_attrs.name).get_tensor()) - self.assertTrue(np.array_equal(weight_init, - weight)) # weight unchanged + np.testing.assert_array_equal(weight_init, + weight) # weight unchanged def test_prune_feed_var_in_fetchlist_2(self): # the variable to be fed is leaf @@ -825,8 +825,8 @@ def test_prune_feed_var_in_fetchlist_2(self): self.assertIsNone(scope.find_var(loss2.name)) weight = np.array( scope.find_var(w_param_attrs.name).get_tensor()) - self.assertTrue(np.array_equal(weight_init, - weight)) # weight unchanged + np.testing.assert_array_equal(weight_init, + weight) # weight unchanged if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_combination.py b/python/paddle/fluid/tests/unittests/test_py_reader_combination.py index d3bc50bffe7cd..84ea5f5f7bb7e 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_combination.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_combination.py @@ -46,8 +46,8 @@ def assertFeedVarEqual(self, reader_list_data, py_reader_dict_data): image2 = np.array(py_reader_dict_data[0]['image']) label2 = np.array(py_reader_dict_data[0]['label']) - self.assertTrue(np.array_equal(image1, image2)) - self.assertTrue(np.array_equal(label1, label2)) + np.testing.assert_array_equal(image1, image2) + np.testing.assert_array_equal(label1, label2) # FIXME(zjl): do not know why Python 35 would raise SIGABRT if not reset reader # manually. 
diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index f5d18a9268f40..c3b4d02af1d4a 100644 --- a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -232,22 +232,22 @@ def run_test_case(self): self.assertTrue(x.mean(), -0.7517569760481516) self.assertTrue(x.std(), 5773.696619107639) expect = [2535, 2109, 5916, -5011, -261] - self.assertTrue(np.array_equal(x[10, 0, 100, 100:105], expect)) + np.testing.assert_array_equal(x[10, 0, 100, 100:105], expect) expect = [3465, 7206, -8660, -9628, -6574] - self.assertTrue(np.array_equal(x[20, 1, 600, 600:605], expect)) + np.testing.assert_array_equal(x[20, 1, 600, 600:605], expect) expect = [881, 1560, 1100, 9664, 1669] - self.assertTrue(np.array_equal(x[30, 2, 1000, 1000:1005], expect)) + np.testing.assert_array_equal(x[30, 2, 1000, 1000:1005], expect) x = paddle.randint(-10000, 10000, [32, 3, 1024, 1024], dtype='int64').numpy() self.assertTrue(x.mean(), -1.461287518342336) self.assertTrue(x.std(), 5773.023477548159) expect = [7213, -9597, 754, 8129, -1158] - self.assertTrue(np.array_equal(x[10, 0, 100, 100:105], expect)) + np.testing.assert_array_equal(x[10, 0, 100, 100:105], expect) expect = [-7159, 8054, 7675, 6980, 8506] - self.assertTrue(np.array_equal(x[20, 1, 600, 600:605], expect)) + np.testing.assert_array_equal(x[20, 1, 600, 600:605], expect) expect = [3581, 3420, -8027, -5237, -2436] - self.assertTrue(np.array_equal(x[30, 2, 1000, 1000:1005], expect)) + np.testing.assert_array_equal(x[30, 2, 1000, 1000:1005], expect) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py index 5a75e83939711..efbb12353da55 100644 --- a/python/paddle/fluid/tests/unittests/test_randperm_op.py +++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py @@ -173,61 +173,61 @@ def test_fixed_random_number(self): expect = [ 24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413 ] - self.assertTrue(np.array_equal(x[0:10], expect)) + np.testing.assert_array_equal(x[0:10], expect) expect = [ 29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096 ] - self.assertTrue(np.array_equal(x[10000:10010], expect)) + np.testing.assert_array_equal(x[10000:10010], expect) expect = [ 298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343 ] - self.assertTrue(np.array_equal(x[20000:20010], expect)) + np.testing.assert_array_equal(x[20000:20010], expect) x = paddle.randperm(30000, dtype='int64').numpy() expect = [ 6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171 ] - self.assertTrue(np.array_equal(x[0:10], expect)) + np.testing.assert_array_equal(x[0:10], expect) expect = [ 23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818 ] - self.assertTrue(np.array_equal(x[10000:10010], expect)) + np.testing.assert_array_equal(x[10000:10010], expect) expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839] - self.assertTrue(np.array_equal(x[20000:20010], expect)) + np.testing.assert_array_equal(x[20000:20010], expect) x = paddle.randperm(30000, dtype='float32').numpy() expect = [ 5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144., 22906., 10705. ] - self.assertTrue(np.array_equal(x[0:10], expect)) + np.testing.assert_array_equal(x[0:10], expect) expect = [ 1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564., 26991. 
] - self.assertTrue(np.array_equal(x[10000:10010], expect)) + np.testing.assert_array_equal(x[10000:10010], expect) expect = [ 25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339., 4662. ] - self.assertTrue(np.array_equal(x[20000:20010], expect)) + np.testing.assert_array_equal(x[20000:20010], expect) x = paddle.randperm(30000, dtype='float64').numpy() expect = [ 19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147., 9163. ] - self.assertTrue(np.array_equal(x[0:10], expect)) + np.testing.assert_array_equal(x[0:10], expect) expect = [ 15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945., 17624. ] - self.assertTrue(np.array_equal(x[10000:10010], expect)) + np.testing.assert_array_equal(x[10000:10010], expect) expect = [ 10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202., 21404. ] - self.assertTrue(np.array_equal(x[20000:20010], expect)) + np.testing.assert_array_equal(x[20000:20010], expect) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_real_imag_op.py b/python/paddle/fluid/tests/unittests/test_real_imag_op.py index 1402585c03745..f3226158ba353 100644 --- a/python/paddle/fluid/tests/unittests/test_real_imag_op.py +++ b/python/paddle/fluid/tests/unittests/test_real_imag_op.py @@ -119,7 +119,7 @@ def init_input_output(dtype): exe = static.Executor(place) out_value = exe.run(feed=input_dict, fetch_list=[out.name]) - self.assertTrue(np.array_equal(np_res, out_value[0])) + np.testing.assert_array_equal(np_res, out_value[0]) def test_in_dynamic_mode(self): for dtype in self.dtypes: @@ -131,10 +131,10 @@ def test_in_dynamic_mode(self): with fluid.dygraph.guard(place): input_t = paddle.to_tensor(input) res = paddle_apis[self.api](input_t).numpy() - self.assertTrue(np.array_equal(np_res, res)) + np.testing.assert_array_equal(np_res, res) res_t = input_t.real().numpy( ) if self.api is "real" else input_t.imag().numpy() - self.assertTrue(np.array_equal(np_res, res_t)) + np.testing.assert_array_equal(np_res, res_t) def test_name_argument(self): with static.program_guard(static.Program()): diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py index adc9e513eaae4..57b87bd896347 100644 --- a/python/paddle/fluid/tests/unittests/test_reverse_op.py +++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py @@ -181,10 +181,10 @@ def check_output(self, gt, res): arr_len = len(res) - 1 reversed_array = res[-1] # check output - self.assertTrue(np.array_equal(gt, reversed_array)) + np.testing.assert_array_equal(gt, reversed_array) # check grad for i in range(arr_len): - self.assertTrue(np.array_equal(res[i], np.ones_like(res[i]))) + np.testing.assert_array_equal(res[i], np.ones_like(res[i])) def test_raise_error(self): # The len(axis) should be 1 is input(X) is LoDTensorArray diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py index 75b8b169cb2e1..2a9f69bff1eba 100644 --- a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -52,8 +52,8 @@ def write_file(name, ct): def get_files(pth, prefix): return [ - f for f in listdir(pth) if isfile(join(pth, f)) and f.startswith(prefix) - and f != f"{prefix}.gpu.log" + f for f in listdir(pth) + if isfile(join(pth, f)) and not f.endswith('gpu.log') ] @@ -101,17 +101,19 @@ def test_collective_2(self): def test_collective_3(self): log_dir = tempfile.TemporaryDirectory() port = random.randrange(6000, 8000) - 
args = "--job_id test3 --devices 0,1 --log_dir {} --master 127.0.0.1:{} --nnodes 2".format( - log_dir.name, port) - p1 = self.pdrun(args) - p2 = self.pdrun(args) + args = "--job_id test3 --devices 0,1 --log_dir {} --master 127.0.0.1:{} --nnodes 2" + p1 = self.pdrun(args.format(log_dir.name + "/1", port)) + p2 = self.pdrun(args.format(log_dir.name + "/2", port)) p1.wait() p2.wait() self.assertTrue(p1.poll() == 0) self.assertTrue(p2.poll() == 0) - c = get_files(log_dir.name, 'test3') - self.assertTrue(len(c) == 6) + c1 = get_files(log_dir.name + "/1", 'test3') + c2 = get_files(log_dir.name + "/2", 'test3') + print(c1) + self.assertTrue(len(c1) == 3) + self.assertTrue(len(c2) == 3) log_dir.cleanup() @@ -156,17 +158,19 @@ def test_ps_2(self): def test_ps_3(self): log_dir = tempfile.TemporaryDirectory() port = random.randrange(6000, 8000) - args = "--job_id ps3 --log_dir {} --master 127.0.0.1:{} --nnodes 2 --server_num=1 --trainer_num=1".format( - log_dir.name, port) - p1 = self.pdrun(args) - p2 = self.pdrun(args) + args = "--job_id ps3 --log_dir {} --master 127.0.0.1:{} --nnodes 2 --server_num=1 --trainer_num=1" + p1 = self.pdrun(args.format(log_dir.name + "/1", port)) + p2 = self.pdrun(args.format(log_dir.name + "/2", port)) p1.wait() p2.wait() self.assertTrue(p1.poll() == 0) self.assertTrue(p2.poll() == 0) - c = get_files(log_dir.name, 'ps3') - self.assertTrue(len(c) == 6) + c1 = get_files(log_dir.name + "/1", 'ps3') + c2 = get_files(log_dir.name + "/2", 'ps3') + print(c1) + self.assertTrue(len(c1) == 3) + self.assertTrue(len(c2) == 3) log_dir.cleanup() def test_ps_4(self): @@ -178,6 +182,7 @@ def test_ps_4(self): self.assertTrue(p1.poll() == 0) c = get_files(log_dir.name, 'ps4') + print(c) self.assertTrue(len(c) == 5) log_dir.cleanup() diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py index f00b5fdc436e8..0beee7b0a1ccb 100644 --- a/python/paddle/fluid/tests/unittests/test_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_scale_op.py @@ -215,7 +215,7 @@ def test_api(self): exe = paddle.static.Executor(place=paddle.CPUPlace()) out = exe.run(main_prog, feed={"x": input}, fetch_list=[out]) - self.assertEqual(np.array_equal(out[0], input * 2.0 + 3.0), True) + np.testing.assert_array_equal(out[0], input * 2.0 + 3.0) class TestScaleInplaceApiStatic(TestScaleApiStatic): @@ -234,7 +234,7 @@ def test_api(self): input = np.random.random([2, 25]).astype("float32") x = paddle.to_tensor(input) out = self._executed_api(x, scale=2.0, bias=3.0) - self.assertEqual(np.array_equal(out.numpy(), input * 2.0 + 3.0), True) + np.testing.assert_array_equal(out.numpy(), input * 2.0 + 3.0) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py index 1833f36013d27..ecf67c71cbdae 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py @@ -244,8 +244,7 @@ def testcase5(self): cpu_value = paddle.scatter_nd_add(paddle.to_tensor(x), paddle.to_tensor(index), paddle.to_tensor(val)) - self.assertTrue(np.array_equal(gpu_value.numpy(), - cpu_value.numpy())) + np.testing.assert_array_equal(gpu_value.numpy(), cpu_value.numpy()) paddle.set_device(device) @switch_to_static_graph @@ -267,7 +266,7 @@ def test_static_graph(): gpu_value = gpu_exe.run(feed=feed, fetch_list=fetch)[0] cpu_exe = paddle.static.Executor(paddle.CPUPlace()) cpu_value = cpu_exe.run(feed=feed, 
fetch_list=fetch)[0] - self.assertTrue(np.array_equal(gpu_value, cpu_value)) + np.testing.assert_array_equal(gpu_value, cpu_value) test_static_graph() diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py index 2fe162d809019..d54ed142178bc 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py @@ -289,7 +289,7 @@ def test_static_graph(): gpu_value = gpu_exe.run(feed=feed, fetch_list=fetch)[0] return gpu_value - self.assertTrue(np.array_equal(test_dygraph(), test_static_graph())) + np.testing.assert_array_equal(test_dygraph(), test_static_graph()) @unittest.skipIf(not core.is_compiled_with_cuda(), diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 9aee71af41675..59ccff3973ff0 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -1133,13 +1133,15 @@ def set_value(t, value): [[2916., 4000., 5324., 6912.]]]], [[[[0., 0., 0., 0.]], [[0., 0., 0., 0.]], [[0., 0., 0., 0.]]]]]]) - self.assertTrue( - np.array_equal(inps.grad.numpy(), input_grad), - msg="The gradient of value should be \n{},\n but reveived {}". + np.testing.assert_array_equal( + inps.grad.numpy(), + input_grad, + err_msg='The gradient of value should be \n{},\n but reveived {}'. format(input_grad, inps.grad.numpy())) - self.assertTrue( - np.array_equal(value.grad.numpy(), value_grad), - msg="The gradient of input should be \n{},\n but reveived {}". + np.testing.assert_array_equal( + value.grad.numpy(), + value_grad, + err_msg='The gradient of input should be \n{},\n but reveived {}'. format(value_grad, value.grad.numpy())) # case 2 @@ -1159,13 +1161,15 @@ def set_value(t, value): [16384., 19652., 23328.]], [[27436., 32000., 37044.], [42592., 48668., 55296.]]]) - self.assertTrue( - np.array_equal(inps2.grad.numpy(), input_grad2), - msg="The gradient of value should be \n{},\n but reveived {}". + np.testing.assert_array_equal( + inps2.grad.numpy(), + input_grad2, + err_msg='The gradient of value should be \n{},\n but reveived {}'. format(input_grad, inps2.grad.numpy())) - self.assertTrue( - np.array_equal(value2.grad.numpy(), value_grad2), - msg="The gradient of input should be \n{},\n but reveived {}". + np.testing.assert_array_equal( + value2.grad.numpy(), + value_grad2, + err_msg='The gradient of input should be \n{},\n but reveived {}'. format(value_grad, value2.grad.numpy())) # case 3 @@ -1196,13 +1200,15 @@ def set_value3(t, value): [[[[[27436.], [32000.]]]], [[[[37044.], [42592.]]]], [[[[48668.], [55296.]]]]]]) - self.assertTrue( - np.array_equal(inps.grad.numpy(), input_grad), - msg="The gradient of value should be \n{},\n but reveived {}". + np.testing.assert_array_equal( + inps.grad.numpy(), + input_grad, + err_msg='The gradient of value should be \n{},\n but reveived {}'. format(input_grad, inps.grad.numpy())) - self.assertTrue( - np.array_equal(value.grad.numpy(), value_grad), - msg="The gradient of input should be \n{},\n but reveived {}". + np.testing.assert_array_equal( + value.grad.numpy(), + value_grad, + err_msg='The gradient of input should be \n{},\n but reveived {}'. 
format(value_grad, value.grad.numpy())) #case 4: step >0 @@ -1229,13 +1235,15 @@ def set_value4(t, value): [[[[8788.], [10976.], [13500.], [16384.]]], [[[19652.], [23328.], [27436.], [32000.]]], [[[37044.], [42592.], [48668.], [55296.]]]]]) - self.assertTrue( - np.array_equal(inps.grad.numpy(), input_grad), - msg="The gradient of value should be \n{},\n but reveived {}". + np.testing.assert_array_equal( + inps.grad.numpy(), + input_grad, + err_msg='The gradient of value should be \n{},\n but reveived {}'. format(input_grad, inps.grad.numpy())) - self.assertTrue( - np.array_equal(value.grad.numpy(), value_grad), - msg="The gradient of input should be \n{},\n but reveived {}". + np.testing.assert_array_equal( + value.grad.numpy(), + value_grad, + err_msg='The gradient of input should be \n{},\n but reveived {}'. format(value_grad, value.grad.numpy())) # case 5:a[0].shape==value.shape @@ -1262,13 +1270,15 @@ def set_value5(t, value): [[8788., 10976., 13500., 16384.], [19652., 23328., 27436., 32000.], [37044., 42592., 48668., 55296.]]]) - self.assertTrue( - np.array_equal(inps.grad.numpy(), input_grad), - msg="The gradient of value should be \n{},\n but reveived {}". + np.testing.assert_array_equal( + inps.grad.numpy(), + input_grad, + err_msg='The gradient of value should be \n{},\n but reveived {}'. format(input_grad, inps.grad.numpy())) - self.assertTrue( - np.array_equal(value.grad.numpy(), value_grad), - msg="The gradient of input should be \n{},\n but reveived {}". + np.testing.assert_array_equal( + value.grad.numpy(), + value_grad, + err_msg='The gradient of input should be \n{},\n but reveived {}'. format(value_grad, value.grad.numpy())) # case 6: pass stop_gradient from value to x @@ -1472,7 +1482,7 @@ def test_inplace(self): b[paddle.to_tensor(0)] = 1.0 self.assertTrue(id(b) == id(c)) - self.assertTrue(np.array_equal(b.numpy(), c.numpy())) + np.testing.assert_array_equal(b.numpy(), c.numpy()) self.assertEqual(b.inplace_version, 1) paddle.enable_static() @@ -1510,8 +1520,8 @@ def test_inplace_var_become_leaf_var(self): a_grad_2 = a.grad.numpy() b_grad_2 = b.grad.numpy() - self.assertTrue(np.array_equal(a_grad_1, a_grad_2)) - self.assertTrue(np.array_equal(b_grad_1, b_grad_2)) + np.testing.assert_array_equal(a_grad_1, a_grad_2) + np.testing.assert_array_equal(b_grad_1, b_grad_2) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_shuffle_batch_op.py b/python/paddle/fluid/tests/unittests/test_shuffle_batch_op.py index 6292a4d2b517d..962acad43edb9 100644 --- a/python/paddle/fluid/tests/unittests/test_shuffle_batch_op.py +++ b/python/paddle/fluid/tests/unittests/test_shuffle_batch_op.py @@ -68,7 +68,7 @@ def verify_output(self, outs): assert y is not None sort_x = self.sort_array(x) sort_y = self.sort_array(y) - self.assertTrue(np.array_equal(sort_x, sort_y)) + np.testing.assert_array_equal(sort_x, sort_y) def sort_array(self, array): shape = array.shape diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index cdffb6fb37fb8..ddf0af21cddc0 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -600,7 +600,7 @@ def test_starts_ends_is_tensor(self): ends=paddle.to_tensor(ends, dtype='int32')) a_2 = paddle.slice(a, axes=axes, starts=starts, ends=ends) - self.assertTrue(np.array_equal(a_1.numpy(), a_2.numpy())) + np.testing.assert_array_equal(a_1.numpy(), a_2.numpy()) def test_bool_tensor(self): with 
paddle.fluid.dygraph.guard(): @@ -616,7 +616,7 @@ def test_bool_tensor(self): y_np = tt[0:3, 1:5, 2:4] self.assertTrue(paddle.bool == y_paddle.dtype) - self.assertTrue(np.array_equal(y_paddle.numpy(), y_np)) + np.testing.assert_array_equal(y_paddle.numpy(), y_np) class TestSliceApiEager(unittest.TestCase): @@ -635,11 +635,11 @@ def test_slice_api(self): axes=axes, starts=paddle.to_tensor(starts), ends=paddle.to_tensor(ends)) - + np.testing.assert_array_equal(a_1.numpy(), a_2.numpy()) a_1.backward() grad_truth = paddle.zeros_like(a) grad_truth[-3:3, 0:2, 2:4] = 1 - self.assertTrue(np.array_equal(grad_truth, a.gradient())) + np.testing.assert_array_equal(grad_truth, a.gradient()) self.assertTrue(np.allclose(a_1.numpy(), a[-3:3, 0:2, 2:4])) @@ -710,10 +710,10 @@ def test_case_1(self): self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR) self.assertEqual(self.sliced_arr.shape, self.shape) - self.assertTrue(np.array_equal(self.out, self.data)) - self.assertTrue(np.array_equal(self.g_x0, np.ones_like(self.data))) - self.assertTrue(np.array_equal(self.g_x1, np.zeros_like(self.data))) - self.assertTrue(np.array_equal(self.g_x2, np.zeros_like(self.data))) + np.testing.assert_array_equal(self.out, self.data) + np.testing.assert_array_equal(self.g_x0, np.ones_like(self.data)) + np.testing.assert_array_equal(self.g_x1, np.zeros_like(self.data)) + np.testing.assert_array_equal(self.g_x2, np.zeros_like(self.data)) def test_case_2(self): main_program = fluid.Program() @@ -722,12 +722,11 @@ def test_case_2(self): self.assertTrue( self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY) self.assertEqual(self.sliced_arr.shape, self.shape) - self.assertTrue( - np.array_equal(self.out, - np.stack([self.data, self.data], axis=self.axis))) - self.assertTrue(np.array_equal(self.g_x0, np.ones_like(self.data))) - self.assertTrue(np.array_equal(self.g_x1, np.ones_like(self.data))) - self.assertTrue(np.array_equal(self.g_x2, np.zeros_like(self.data))) + np.testing.assert_array_equal( + self.out, np.stack([self.data, self.data], axis=self.axis)) + np.testing.assert_array_equal(self.g_x0, np.ones_like(self.data)) + np.testing.assert_array_equal(self.g_x1, np.ones_like(self.data)) + np.testing.assert_array_equal(self.g_x2, np.zeros_like(self.data)) def test_case_3(self): main_program = fluid.Program() @@ -736,13 +735,12 @@ def test_case_3(self): self.assertTrue( self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY) self.assertEqual(self.sliced_arr.shape, self.shape) - self.assertTrue( - np.array_equal( - self.out, - np.stack([self.data, self.data, self.data], axis=self.axis))) - self.assertTrue(np.array_equal(self.g_x0, np.ones_like(self.data))) - self.assertTrue(np.array_equal(self.g_x1, np.ones_like(self.data))) - self.assertTrue(np.array_equal(self.g_x2, np.ones_like(self.data))) + np.testing.assert_array_equal( + self.out, np.stack([self.data, self.data, self.data], + axis=self.axis)) + np.testing.assert_array_equal(self.g_x0, np.ones_like(self.data)) + np.testing.assert_array_equal(self.g_x1, np.ones_like(self.data)) + np.testing.assert_array_equal(self.g_x2, np.ones_like(self.data)) class TestImperativeVarBaseGetItem(unittest.TestCase): @@ -796,11 +794,11 @@ def test_axis_less_than_zero(self): 100, ], [0], [1]) np_slice = x_arr[:, :, 0:1] - self.assertTrue(np.array_equal(pp_slice, np_slice)) + np.testing.assert_array_equal(pp_slice, np_slice) pp_slice = paddle.slice(x, (-100, ), [0], [1]) np_slice = x_arr[0:1] - self.assertTrue(np.array_equal(pp_slice, np_slice)) + 
np.testing.assert_array_equal(pp_slice, np_slice) x_arr = np.array([], dtype=np.float32) x = paddle.to_tensor(np.reshape(x_arr, (0, 0, 0))) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py index d67fe0b7d5a03..8ac4d777317ce 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py @@ -160,7 +160,7 @@ def test_sparse_div_scalar(self): def test_sparse_cast(self): self.compare_with_dense_two_attr(paddle.cast, - paddle.incubate.sparse.cast, 'int16', + paddle.incubate.sparse.cast, 'int32', 'float32') self.compare_with_dense_two_attr(paddle.cast, paddle.incubate.sparse.cast, 'int32', diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index e3f72d7b41ca2..2c71ada56129f 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -503,6 +503,21 @@ def test_axis_tensor_input(self): self.assertTrue(np.allclose(ex_x1, x1_out)) self.assertTrue(np.allclose(ex_x2, x2_out)) + def func_negative_one_section(self): + with fluid.dygraph.guard(): + input_1 = np.random.random([4, 6, 6]).astype("int32") + # input is a variable which shape is [4, 6, 6] + input = paddle.to_tensor(input_1) + num1 = paddle.full(shape=[1], fill_value=1, dtype='int32') + x0 = paddle.split(input, num_or_sections=[-1], axis=num1) + x0_out = x0[0].numpy() + np.testing.assert_array_equal(x0_out, input.numpy()) + + def test_negative_one_section(self): + with _test_eager_guard(): + self.func_negative_one_section() + self.func_negative_one_section() + class API_TestEmptySplit(unittest.TestCase): @@ -525,4 +540,5 @@ def test_axis_input_empty_section(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_split_program.py b/python/paddle/fluid/tests/unittests/test_split_program.py index ff8348eb71913..7ed33ebde4c78 100644 --- a/python/paddle/fluid/tests/unittests/test_split_program.py +++ b/python/paddle/fluid/tests/unittests/test_split_program.py @@ -66,8 +66,10 @@ def test_split_program(self): self.assertEqual(len(vars_actual), len(vars_expected)) for actual, expected in zip(vars_actual, vars_expected): self.assertEqual(actual.shape, expected.shape) - self.assertTrue(np.array_equal(actual, expected), - '{}\n{}\n'.format(actual, expected)) + np.testing.assert_array_equal(actual, + expected, + err_msg='{}\n{}\n'.format( + actual, expected)) def get_places(self): places = [paddle.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py index 1c28393f3306e..8c4131d71d081 100644 --- a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py @@ -62,7 +62,7 @@ def check_place(self, place): x = paddle.to_tensor(x_np) y1 = _C_ops.squared_l2_norm(x) y2 = _C_ops.squared_l2_norm(x) - self.assertTrue(np.array_equal(y1.numpy(), y2.numpy())) + np.testing.assert_array_equal(y1.numpy(), y2.numpy()) def test_main(self): self.check_place(paddle.CPUPlace()) diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py index 6f4e490be6bfe..5bd9e2634a0cb 100644 --- a/python/paddle/fluid/tests/unittests/test_stack_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_stack_op.py @@ -174,9 +174,8 @@ def test_case(self): self.assertTrue(self.out_var.shape[self.axis] == -1) exe = fluid.Executor(self.place) res = exe.run(self.program, fetch_list=self.out_var) - self.assertTrue( - np.array_equal(res[0], - np.stack([self.x] * self.iter_num, axis=self.axis))) + np.testing.assert_array_equal( + res[0], np.stack([self.x] * self.iter_num, axis=self.axis)) class TestTensorStackAPIWithLoDTensorArray(unittest.TestCase): @@ -209,9 +208,8 @@ def test_case(self): self.assertTrue(self.out_var.shape[self.axis] == -1) exe = fluid.Executor(self.place) res = exe.run(self.program, fetch_list=self.out_var) - self.assertTrue( - np.array_equal(res[0], - np.stack([self.x] * self.iter_num, axis=self.axis))) + np.testing.assert_array_equal( + res[0], np.stack([self.x] * self.iter_num, axis=self.axis)) class API_test(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index 8a4f8f9201317..2a30088a001ff 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -332,7 +332,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) temp_dir.cleanup() @@ -450,7 +450,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) fluid.load(test_program, os.path.join(temp_dir.name, "test_1.pdmodel"), None) temp_dir.cleanup() @@ -561,7 +561,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) temp_dir.cleanup() @@ -691,7 +691,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) # check 1 for var in main_program.list_vars(): @@ -711,7 +711,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) # check 2 for var in main_program.list_vars(): @@ -731,7 +731,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) # check 3 for var in main_program.list_vars(): @@ -751,7 +751,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) temp_dir.cleanup() @@ -840,7 +840,7 @@ def set_var(var, ndarray): new_t = np.array(new_scope.find_var(var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) temp_dir.cleanup() @@ -966,7 +966,7 @@ def 
test_load_from_old_interface(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) for var in main_program.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: @@ -1096,7 +1096,7 @@ def test_load_from_old_interface_var_list(self): if var.name in var_list_names: # loaded vars base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) else: #not loaded vars self.assertTrue(np.sum(np.abs(new_t)) == 0) @@ -1213,7 +1213,7 @@ def test_load_from_old_interface(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) # test exception # change shape @@ -1400,7 +1400,7 @@ def symlink_force(target, link_name): with fluid.dygraph.guard(place): load_state = fluid.load_program_state(save_dir) for k, v in load_state.items(): - self.assertTrue(np.array_equal(base_map[k], v)) + np.testing.assert_array_equal(base_map[k], v) def create_symlink(self, target, link_name): try: @@ -1416,7 +1416,7 @@ def check_in_static(self, main_program, base_map): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) class TestProgramStateOldSaveSingleModel(unittest.TestCase): @@ -1540,7 +1540,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) with self.assertRaises(ValueError): fluid.load_program_state(os.path.join(save_dir, "model_1")) @@ -1623,7 +1623,36 @@ def test_pickle_protocol(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) + + +class TestSaveLoadInferenceModel(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, 'no_params') + + def tearDown(self): + self.temp_dir.cleanup() + + def test_no_params(self): + main_program = framework.Program() + with framework.program_guard(main_program): + x = paddle.static.data(name="x", shape=[10, 10], dtype='float32') + y = x + x + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + + paddle.static.save_inference_model(self.model_path, [x], [y], exe) + + [inference_program, feed_target_names, fetch_targets + ] = (paddle.static.load_inference_model(self.model_path, exe)) + + self.assertEqual(feed_target_names, ['x']) + self.assertEqual(fetch_targets[0].shape, (10, 10)) + ops = [op.type for op in inference_program.block(0).ops] + self.assertEqual(ops, ['feed', 'elementwise_add', 'scale', 'fetch']) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py b/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py index 6da849a44bdf4..dc46578b04906 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py @@ -141,7 +141,7 @@ def test_ptb_rnn_cpu_bfloat16(self): new_t = 
np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py index e45cd59b444b9..0a417df56c371 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py @@ -75,7 +75,7 @@ def test_large_parameters_static_save(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) # set var to zero for var in prog.list_vars(): @@ -94,7 +94,7 @@ def test_large_parameters_static_save(self): new_t = np.array(fluid.global_scope().find_var( var.name).get_tensor()) base_t = base_map[var.name] - self.assertTrue(np.array_equal(new_t, base_t)) + np.testing.assert_array_equal(new_t, base_t) temp_dir.cleanup() diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py index e8d42a2fae8c8..b68fbd9468c07 100644 --- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py @@ -748,10 +748,11 @@ def create_case(self, net): net.clear_all_grad() # compare result of dygraph and static self.is_grads_equal(grads_static, grads_dy) - self.assertTrue( - np.array_equal(s1, s2), - msg="dygraph graph result:\n{} \nstatic dygraph result:\n{}".format( - l1.numpy(), l2.numpy())) + np.testing.assert_array_equal( + s1, + s2, + err_msg='dygraph graph result:\n{} \nstatic dygraph result:\n{}'. 
+ format(l1.numpy(), l2.numpy())) def test_strided_slice_tensor_array_cuda_pinned_place(self): if paddle.device.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index ad226878f7ef1..2d2bc8487cac7 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -112,11 +112,9 @@ def check_input_and_optput(self, if has_data_w_num > 0: self.assertEqual(len(out.rows()), 7) - self.assertTrue( - np.array_equal( - np.array(out.get_tensor()), - self._get_array(self.rows, self.row_numel) * - has_data_w_num)) + np.testing.assert_array_equal( + np.array(out.get_tensor()), + self._get_array(self.rows, self.row_numel) * has_data_w_num) else: self.assertEqual(len(out.rows()), 0) @@ -252,13 +250,10 @@ def check_with_place(self, place, inplace): out_t = np.array(out) self.assertEqual(out_t.shape[0], self.height) - self.assertTrue( - np.array_equal( - out_t, - self._get_array([i - for i in range(self.height)], self.row_numel) * - np.tile( - np.array(result).reshape(self.height, 1), self.row_numel))) + np.testing.assert_array_equal( + out_t, + self._get_array([i for i in range(self.height)], self.row_numel) * + np.tile(np.array(result).reshape(self.height, 1), self.row_numel)) def create_lod_tensor(self, scope, place, var_name): var = scope.var(var_name) diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py index 2ea88c89a37ac..94db0cfe067d1 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -17,7 +17,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core import unittest -import numpy +import numpy as np import numbers @@ -25,7 +25,7 @@ class TestTensorPtr(unittest.TestCase): def test_tensor_ptr(self): t = core.Tensor() - np_arr = numpy.zeros([2, 3]) + np_arr = np.zeros([2, 3]) t.set(np_arr, core.CPUPlace()) self.assertGreater(t._ptr(), 0) @@ -47,13 +47,13 @@ def test_int_tensor(self): tensor._set_dims([1000, 784]) tensor._alloc_int(place) - tensor_array = numpy.array(tensor) + tensor_array = np.array(tensor) self.assertEqual((1000, 784), tensor_array.shape) tensor_array[3, 9] = 1 tensor_array[19, 11] = 2 tensor.set(tensor_array, place) - tensor_array_2 = numpy.array(tensor) + tensor_array_2 = np.array(tensor) self.assertEqual(1, tensor_array_2[3, 9]) self.assertEqual(2, tensor_array_2[19, 11]) @@ -67,13 +67,13 @@ def test_float_tensor(self): tensor._set_dims([1000, 784]) tensor._alloc_float(place) - tensor_array = numpy.array(tensor) + tensor_array = np.array(tensor) self.assertEqual((1000, 784), tensor_array.shape) tensor_array[3, 9] = 1.0 tensor_array[19, 11] = 2.0 tensor.set(tensor_array, place) - tensor_array_2 = numpy.array(tensor) + tensor_array_2 = np.array(tensor) self.assertAlmostEqual(1.0, tensor_array_2[3, 9]) self.assertAlmostEqual(2.0, tensor_array_2[19, 11]) @@ -81,24 +81,24 @@ def test_int8_tensor(self): scope = core.Scope() var = scope.var("int8_tensor") cpu_tensor = var.get_tensor() - tensor_array = numpy.random.randint(-127, - high=128, - size=[100, 200], - dtype=numpy.int8) + tensor_array = np.random.randint(-127, + high=128, + size=[100, 200], + dtype=np.int8) place = core.CPUPlace() cpu_tensor.set(tensor_array, place) - cpu_tensor_array_2 = numpy.array(cpu_tensor) + cpu_tensor_array_2 = np.array(cpu_tensor) self.assertAlmostEqual(cpu_tensor_array_2.all(), tensor_array.all()) if 
core.is_compiled_with_cuda(): cuda_tensor = var.get_tensor() - tensor_array = numpy.random.randint(-127, - high=128, - size=[100, 200], - dtype=numpy.int8) + tensor_array = np.random.randint(-127, + high=128, + size=[100, 200], + dtype=np.int8) place = core.CUDAPlace(0) cuda_tensor.set(tensor_array, place) - cuda_tensor_array_2 = numpy.array(cuda_tensor) + cuda_tensor_array_2 = np.array(cuda_tensor) self.assertAlmostEqual(cuda_tensor_array_2.all(), tensor_array.all()) @@ -110,14 +110,14 @@ def test_int_lod_tensor(self): lod_tensor._set_dims([4, 4, 6]) lod_tensor._alloc_int(place) - array = numpy.array(lod_tensor) + array = np.array(lod_tensor) array[0, 0, 0] = 3 array[3, 3, 5] = 10 lod_tensor.set(array, place) lod_tensor.set_recursive_sequence_lengths([[2, 2]]) - lod_v = numpy.array(lod_tensor) - self.assertTrue(numpy.alltrue(array == lod_v)) + lod_v = np.array(lod_tensor) + self.assertTrue(np.alltrue(array == lod_v)) lod = lod_tensor.recursive_sequence_lengths() self.assertEqual(2, lod[0][0]) @@ -132,13 +132,13 @@ def test_float_lod_tensor(self): lod_tensor._set_dims([5, 2, 3, 4]) lod_tensor._alloc_float(place) - tensor_array = numpy.array(lod_tensor) + tensor_array = np.array(lod_tensor) self.assertEqual((5, 2, 3, 4), tensor_array.shape) tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 1] = 2.0 lod_tensor.set(tensor_array, place) - lod_v = numpy.array(lod_tensor) + lod_v = np.array(lod_tensor) self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) self.assertEqual(len(lod_tensor.recursive_sequence_lengths()), 0) @@ -156,12 +156,12 @@ def test_lod_tensor_init(self): lod_tensor._set_dims([5, 2, 3, 4]) lod_tensor.set_recursive_sequence_lengths(lod_py) lod_tensor._alloc_float(place) - tensor_array = numpy.array(lod_tensor) + tensor_array = np.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 1] = 2.0 lod_tensor.set(tensor_array, place) - lod_v = numpy.array(lod_tensor) + lod_v = np.array(lod_tensor) self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths()) @@ -176,12 +176,12 @@ def test_lod_tensor_gpu_init(self): lod_tensor._set_dims([5, 2, 3, 4]) lod_tensor.set_recursive_sequence_lengths(lod_py) lod_tensor._alloc_float(place) - tensor_array = numpy.array(lod_tensor) + tensor_array = np.array(lod_tensor) tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 1] = 2.0 lod_tensor.set(tensor_array, place) - lod_v = numpy.array(lod_tensor) + lod_v = np.array(lod_tensor) self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths()) @@ -195,13 +195,13 @@ def test_empty_tensor(self): tensor._set_dims([0, 1]) tensor._alloc_float(place) - tensor_array = numpy.array(tensor) + tensor_array = np.array(tensor) self.assertEqual((0, 1), tensor_array.shape) if core.is_compiled_with_cuda(): gpu_place = core.CUDAPlace(0) tensor._alloc_float(gpu_place) - tensor_array = numpy.array(tensor) + tensor_array = np.array(tensor) self.assertEqual((0, 1), tensor_array.shape) def run_slice_tensor(self, place, dtype): @@ -209,43 +209,43 @@ def run_slice_tensor(self, place, dtype): shape = [3, 3, 3] tensor._set_dims(shape) - tensor_array = numpy.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]], - [[10, 11, 12], [13, 14, 15], [16, 17, 18]], - [[19, 20, 21], [22, 23, 24], - [25, 26, 27]]]).astype(dtype) + tensor_array = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]], + 
[[10, 11, 12], [13, 14, 15], [16, 17, 18]], + [[19, 20, 21], [22, 23, 24], + [25, 26, 27]]]).astype(dtype) tensor.set(tensor_array, place) n1 = tensor[1] t1 = tensor_array[1] - self.assertTrue((numpy.array(n1) == numpy.array(t1)).all()) + self.assertTrue((np.array(n1) == np.array(t1)).all()) n2 = tensor[1:] t2 = tensor_array[1:] - self.assertTrue((numpy.array(n2) == numpy.array(t2)).all()) + self.assertTrue((np.array(n2) == np.array(t2)).all()) n3 = tensor[0:2:] t3 = tensor_array[0:2:] - self.assertTrue((numpy.array(n3) == numpy.array(t3)).all()) + self.assertTrue((np.array(n3) == np.array(t3)).all()) n4 = tensor[2::-2] t4 = tensor_array[2::-2] - self.assertTrue((numpy.array(n4) == numpy.array(t4)).all()) + self.assertTrue((np.array(n4) == np.array(t4)).all()) n5 = tensor[2::-2][0] t5 = tensor_array[2::-2][0] - self.assertTrue((numpy.array(n5) == numpy.array(t5)).all()) + self.assertTrue((np.array(n5) == np.array(t5)).all()) n6 = tensor[2:-1:-1] t6 = tensor_array[2:-1:-1] - self.assertTrue((numpy.array(n6) == numpy.array(t6)).all()) + self.assertTrue((np.array(n6) == np.array(t6)).all()) n7 = tensor[0:, 0:] t7 = tensor_array[0:, 0:] - self.assertTrue((numpy.array(n7) == numpy.array(t7)).all()) + self.assertTrue((np.array(n7) == np.array(t7)).all()) n8 = tensor[0::1, 0::-1, 2:] t8 = tensor_array[0::1, 0::-1, 2:] - self.assertTrue((numpy.array(n8) == numpy.array(t8)).all()) + self.assertTrue((np.array(n8) == np.array(t8)).all()) def test_slice_tensor(self): for dtype in self.support_dtypes: @@ -264,7 +264,7 @@ def test_print_tensor(self): tensor = var.get_tensor() tensor._set_dims([10, 10]) tensor._alloc_int(place) - tensor_array = numpy.array(tensor) + tensor_array = np.array(tensor) self.assertEqual((10, 10), tensor_array.shape) tensor_array[0, 0] = 1 tensor_array[2, 2] = 2 @@ -302,62 +302,62 @@ def test_tensor_poiter(self): numbers.Integral)) def test_tensor_set_fp16(self): - array = numpy.random.random((300, 500)).astype("float16") + array = np.random.random((300, 500)).astype("float16") tensor = fluid.Tensor() place = core.CPUPlace() tensor.set(array, place) self.assertEqual(tensor._dtype(), core.VarDesc.VarType.FP16) - self.assertTrue(numpy.array_equal(numpy.array(tensor), array)) + np.testing.assert_array_equal(np.array(tensor), array) if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) tensor.set(array, place) self.assertEqual(tensor._dtype(), core.VarDesc.VarType.FP16) - self.assertTrue(numpy.array_equal(numpy.array(tensor), array)) + np.testing.assert_array_equal(np.array(tensor), array) place = core.CUDAPinnedPlace() tensor.set(array, place) self.assertEqual(tensor._dtype(), core.VarDesc.VarType.FP16) - self.assertTrue(numpy.array_equal(numpy.array(tensor), array)) + np.testing.assert_array_equal(np.array(tensor), array) def test_tensor_set_int16(self): - array = numpy.random.randint(100, size=(300, 500)).astype("int16") + array = np.random.randint(100, size=(300, 500)).astype("int16") tensor = fluid.Tensor() place = core.CPUPlace() tensor.set(array, place) self.assertEqual(tensor._dtype(), core.VarDesc.VarType.INT16) - self.assertTrue(numpy.array_equal(numpy.array(tensor), array)) + np.testing.assert_array_equal(np.array(tensor), array) if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) tensor.set(array, place) self.assertEqual(tensor._dtype(), core.VarDesc.VarType.INT16) - self.assertTrue(numpy.array_equal(numpy.array(tensor), array)) + np.testing.assert_array_equal(np.array(tensor), array) place = core.CUDAPinnedPlace() tensor.set(array, place) 
self.assertEqual(tensor._dtype(), core.VarDesc.VarType.INT16) - self.assertTrue(numpy.array_equal(numpy.array(tensor), array)) + np.testing.assert_array_equal(np.array(tensor), array) def test_tensor_set_from_array_list(self): - array = numpy.random.randint(1000, size=(200, 300)) + array = np.random.randint(1000, size=(200, 300)) list_array = [array, array] tensor = fluid.Tensor() place = core.CPUPlace() tensor.set(list_array, place) self.assertEqual([2, 200, 300], tensor.shape()) - self.assertTrue(numpy.array_equal(numpy.array(tensor), list_array)) + np.testing.assert_array_equal(np.array(tensor), list_array) if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) tensor.set(list_array, place) self.assertEqual([2, 200, 300], tensor.shape()) - self.assertTrue(numpy.array_equal(numpy.array(tensor), list_array)) + np.testing.assert_array_equal(np.array(tensor), list_array) place = core.CUDAPinnedPlace() tensor.set(list_array, place) self.assertEqual([2, 200, 300], tensor.shape()) - self.assertTrue(numpy.array_equal(numpy.array(tensor), list_array)) + np.testing.assert_array_equal(np.array(tensor), list_array) def test_tensor_set_error(self): scope = core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py index d9c4d2c61b266..6eef408f5e37a 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest -import numpy +import numpy as np import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.op import Operator @@ -28,7 +28,7 @@ class TestTensorArrayToTensorError(unittest.TestCase): def test_errors(self): with program_guard(Program()): - input_data = numpy.random.random((2, 4)).astype("float32") + input_data = np.random.random((2, 4)).astype("float32") def test_Variable(): fluid.layers.tensor_array_to_tensor(input=input_data) @@ -65,14 +65,14 @@ def test_get_set(self): for i in range(10): t = core.LoDTensor() if i == 0: - t.set(numpy.array([[i], [i]], dtype='float32'), cpu) + t.set(np.array([[i], [i]], dtype='float32'), cpu) else: - t.set(numpy.array([[i]], dtype='float32'), cpu) + t.set(np.array([[i]], dtype='float32'), cpu) input_tensor_array.append(t) self.assertEqual(10, len(input_tensor_array)) - random_grad = numpy.random.random_sample([11]).astype(numpy.float32) + random_grad = np.random.random_sample([11]).astype(np.float32) y_out = block.create_var(name="Out") y_out.persistable = True @@ -119,13 +119,13 @@ def test_get_set(self): exe = fluid.Executor(fluid.CPUPlace()) out = exe.run(program, fetch_list=fetch_list, scope=scope) - #print ("index: ", numpy.array(out[1])) + #print ("index: ", np.array(out[1])) # test forward - tensor_res = numpy.array(out[0]) - tensor_res_out_idx = numpy.array(out[1]) - tensor_gt = numpy.array([0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], - dtype='float32') + tensor_res = np.array(out[0]) + tensor_res_out_idx = np.array(out[1]) + tensor_gt = np.array([0] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + dtype='float32') self.assertEqual(len(tensor_res), len(tensor_gt)) self.assertEqual(len(tensor_res_out_idx), 10) @@ -148,14 +148,13 @@ def test_get_set(self): for i in range(len(grad_tensor_array)): if i == 0: self.assertEqual( - numpy.array(grad_tensor_array[i])[0], - numpy.array(random_grad[i])) + np.array(grad_tensor_array[i])[0], np.array(random_grad[i])) self.assertEqual( - 
numpy.array(grad_tensor_array[i])[1], - numpy.array(random_grad[i + 1])) + np.array(grad_tensor_array[i])[1], + np.array(random_grad[i + 1])) if i == 1: - self.assertEqual(numpy.array(grad_tensor_array[i]), - numpy.array(random_grad[i + 1])) + self.assertEqual(np.array(grad_tensor_array[i]), + np.array(random_grad[i + 1])) class TestLoDTensorArrayStack(unittest.TestCase): @@ -165,16 +164,16 @@ def setUp(self): self.op_type = "tensor_array_to_tensor" self.attrs = {"axis": 1, "use_stack": True} self.inputs = [ - numpy.random.rand(2, 3, 4).astype("float32"), - numpy.random.rand(2, 3, 4).astype("float32"), - numpy.random.rand(2, 3, 4).astype("float32") + np.random.rand(2, 3, 4).astype("float32"), + np.random.rand(2, 3, 4).astype("float32"), + np.random.rand(2, 3, 4).astype("float32") ] self.outputs = [ - numpy.stack(self.inputs, axis=self.attrs["axis"]), - numpy.array([x.shape[self.attrs["axis"]] for x in self.inputs], - dtype="int32") + np.stack(self.inputs, axis=self.attrs["axis"]), + np.array([x.shape[self.attrs["axis"]] for x in self.inputs], + dtype="int32") ] - self.input_grads = [numpy.ones_like(x) for x in self.inputs] + self.input_grads = [np.ones_like(x) for x in self.inputs] self.set_program() for var in self.program.list_vars(): # to avoid scope clearing after execution @@ -197,15 +196,13 @@ def set_program(self): def run_check(self, executor, scope): executor.run(self.program, scope=scope) for i, output in enumerate(self.outputs): - numpy.allclose(numpy.array( + np.allclose(np.array( scope.var(self.output_vars[i].name).get_tensor()), - output, - atol=0) + output, + atol=0) tensor_array_grad = scope.var(self.array.name).get_lod_tensor_array() for i, input_grad in enumerate(self.input_grads): - numpy.allclose(numpy.array(tensor_array_grad[i]), - input_grad, - atol=0) + np.allclose(np.array(tensor_array_grad[i]), input_grad, atol=0) def test_cpu(self): scope = core.Scope() @@ -239,8 +236,8 @@ def _test_case(self, inp1, inp2): return output_stack, output_index_stack, output_concat, output_index_concat def test_case(self): - inp0 = numpy.random.rand(2, 3, 4).astype("float32") - inp1 = numpy.random.rand(2, 3, 4).astype("float32") + inp0 = np.random.rand(2, 3, 4).astype("float32") + inp1 = np.random.rand(2, 3, 4).astype("float32") _outs_static = self._test_case(inp0, inp1) place = fluid.CPUPlace() @@ -251,7 +248,7 @@ def test_case(self): outs_dynamic = self._test_case(inp0, inp1) for s, d in zip(outs_static, outs_dynamic): - self.assertTrue(numpy.array_equal(s, d.numpy())) + np.testing.assert_array_equal(s, d.numpy()) def test_while_loop_case(self): with fluid.dygraph.guard(): @@ -259,7 +256,7 @@ def test_while_loop_case(self): i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=1) ten = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) array = fluid.layers.create_array(dtype='float32') - inp0 = numpy.random.rand(2, 3, 4).astype("float32") + inp0 = np.random.rand(2, 3, 4).astype("float32") x0 = fluid.layers.assign(inp0) fluid.layers.array_write(x0, zero, array) @@ -275,9 +272,8 @@ def body(i, end, array): self.assertTrue(fluid.layers.array_length(array), 10) last = fluid.layers.fill_constant(shape=[1], dtype='int64', value=9) - self.assertTrue( - numpy.array_equal( - fluid.layers.array_read(array, last).numpy(), inp0)) + np.testing.assert_array_equal( + fluid.layers.array_read(array, last).numpy(), inp0) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py 
b/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py index 64c4be260ed69..6c38d2349905b 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_copy_from.py @@ -25,15 +25,15 @@ def test_main(self): np_value = np.random.random(size=[10, 30]).astype('float32') t_src = Tensor() t_src.set(np_value, place) - self.assertTrue(np.array_equal(np_value, t_src)) + np.testing.assert_array_equal(np_value, t_src) t_dst1 = Tensor() t_dst1._copy_from(t_src, place) - self.assertTrue(np.array_equal(np_value, t_dst1)) + np.testing.assert_array_equal(np_value, t_dst1) t_dst2 = Tensor() t_dst2._copy_from(t_src, place, 5) - self.assertTrue(np.array_equal(np.array(np_value[0:5]), t_dst2)) + np.testing.assert_array_equal(np.array(np_value[0:5]), t_dst2) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py index e71cc3b7239f1..37fee3a380fbd 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py @@ -18,6 +18,7 @@ import numpy as np import six import paddle +from paddle.fluid.framework import _enable_legacy_dygraph class TensorFillDiagTensor_Test(unittest.TestCase): @@ -183,5 +184,9 @@ def test_largedim(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) +class TensorFillDiagTensor_Test_legacy(TensorFillDiagTensor_Test): + _enable_legacy_dygraph() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 9767fb25243f8..07f5b03698823 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -97,16 +97,16 @@ def run_double_hook_for_interior_var(double_hook, removed=False): o.backward() # z.grad is not affected - self.assertTrue(np.array_equal(z.grad.numpy(), w.numpy())) + np.testing.assert_array_equal(z.grad.numpy(), w.numpy()) # w.grad is not changed by hook - self.assertTrue(np.array_equal(w.grad.numpy(), z.numpy())) + np.testing.assert_array_equal(w.grad.numpy(), z.numpy()) # x.grad and y.grad are changed if run hook - self.assertTrue( - np.array_equal(x.grad.numpy(), - z.numpy() * 2 if not removed else z.numpy())) - self.assertTrue( - np.array_equal(y.grad.numpy(), - z.numpy() * 2 if not removed else z.numpy())) + np.testing.assert_array_equal( + x.grad.numpy(), + z.numpy() * 2 if not removed else z.numpy()) + np.testing.assert_array_equal( + y.grad.numpy(), + z.numpy() * 2 if not removed else z.numpy()) def run_print_hook_for_interior_var(print_hook, removed=False): for device in self.devices: @@ -133,10 +133,10 @@ def run_print_hook_for_interior_var(print_hook, removed=False): o.backward() # all grads are not affected - self.assertTrue(np.array_equal(z.grad.numpy(), w.numpy())) - self.assertTrue(np.array_equal(w.grad.numpy(), z.numpy())) - self.assertTrue(np.array_equal(x.grad.numpy(), z.numpy())) - self.assertTrue(np.array_equal(y.grad.numpy(), z.numpy())) + np.testing.assert_array_equal(z.grad.numpy(), w.numpy()) + np.testing.assert_array_equal(w.grad.numpy(), z.numpy()) + np.testing.assert_array_equal(x.grad.numpy(), z.numpy()) + np.testing.assert_array_equal(y.grad.numpy(), z.numpy()) def double_hook(grad): grad = grad * 2 @@ -195,13 +195,13 @@ 
def run_double_hook_for_leaf_var(double_hook, removed=False): o.backward() # z.grad, w.grad, x.grad is not affected - self.assertTrue(np.array_equal(z.grad.numpy(), w.numpy())) - self.assertTrue(np.array_equal(w.grad.numpy(), z.numpy())) - self.assertTrue(np.array_equal(x.grad.numpy(), z.numpy())) + np.testing.assert_array_equal(z.grad.numpy(), w.numpy()) + np.testing.assert_array_equal(w.grad.numpy(), z.numpy()) + np.testing.assert_array_equal(x.grad.numpy(), z.numpy()) # y.grad are changed if run hook - self.assertTrue( - np.array_equal(y.grad.numpy(), - z.numpy() * 2 if not removed else z.numpy())) + np.testing.assert_array_equal( + y.grad.numpy(), + z.numpy() * 2 if not removed else z.numpy()) # register hook run_double_hook_for_leaf_var(lambda grad: grad * 2) @@ -255,15 +255,13 @@ def run_double_hook_for_accumulated_grad_interior_var( base_grad = np.array([5., 9., 13., 19.]) # x.grad is not changed - self.assertTrue(np.array_equal(x.grad.numpy(), base_grad)) + np.testing.assert_array_equal(x.grad.numpy(), base_grad) # b.grad is changed by x.hook - self.assertTrue( - np.array_equal(b.grad.numpy(), - base_grad * 2 if not removed else base_grad)) + np.testing.assert_array_equal( + b.grad.numpy(), base_grad * 2 if not removed else base_grad) # a.grad is changed by x.hook and a.hook - self.assertTrue( - np.array_equal(a.grad.numpy(), - base_grad * 4 if not removed else base_grad)) + np.testing.assert_array_equal( + a.grad.numpy(), base_grad * 4 if not removed else base_grad) # register hook run_double_hook_for_accumulated_grad_interior_var(lambda grad: grad * 2) @@ -310,9 +308,8 @@ def run_double_hook_for_accumulated_grad_leaf_var( base_grad = np.array([5., 9., 13., 19.]) # x.grad is changed by x.hook - self.assertTrue( - np.array_equal(x.grad.numpy(), - base_grad * 2 if not removed else base_grad)) + np.testing.assert_array_equal( + x.grad.numpy(), base_grad * 2 if not removed else base_grad) # register hook run_double_hook_for_accumulated_grad_leaf_var(lambda grad: grad * 2) @@ -364,14 +361,14 @@ def run_double_hook_in_model(data, data, label, lambda grad: grad * 2, True, True) # compare original value and with hook - self.assertTrue(np.array_equal(ret1_grad, ret1_grad_hook)) - self.assertTrue(np.array_equal(linear1_w_grad * 2, linear1_w_grad_hook)) - self.assertTrue(np.array_equal(linear1_b_grad * 2, linear1_b_grad_hook)) + np.testing.assert_array_equal(ret1_grad, ret1_grad_hook) + np.testing.assert_array_equal(linear1_w_grad * 2, linear1_w_grad_hook) + np.testing.assert_array_equal(linear1_b_grad * 2, linear1_b_grad_hook) # compare original value and remove hook - self.assertTrue(np.array_equal(ret1_grad, ret1_grad_rm)) - self.assertTrue(np.array_equal(linear1_w_grad, linear1_w_grad_rm)) - self.assertTrue(np.array_equal(linear1_b_grad, linear1_b_grad_rm)) + np.testing.assert_array_equal(ret1_grad, ret1_grad_rm) + np.testing.assert_array_equal(linear1_w_grad, linear1_w_grad_rm) + np.testing.assert_array_equal(linear1_b_grad, linear1_b_grad_rm) def test_func_hook_in_model(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) @@ -427,37 +424,37 @@ def double_hook(grad): z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( device, hooks) - self.assertTrue(np.array_equal(w_grad, z)) - self.assertTrue(np.array_equal(x_grad, z * 8)) - self.assertTrue(np.array_equal(y_grad, z * 8)) + np.testing.assert_array_equal(w_grad, z) + np.testing.assert_array_equal(x_grad, z * 8) + np.testing.assert_array_equal(y_grad, z * 8) z, w_grad, x_grad, y_grad = 
run_multiple_hooks_for_interior_var( device, hooks, remove1=True) - self.assertTrue(np.array_equal(w_grad, z)) - self.assertTrue(np.array_equal(x_grad, z * 4)) - self.assertTrue(np.array_equal(y_grad, z * 4)) + np.testing.assert_array_equal(w_grad, z) + np.testing.assert_array_equal(x_grad, z * 4) + np.testing.assert_array_equal(y_grad, z * 4) z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( device, hooks, remove2=True) - self.assertTrue(np.array_equal(w_grad, z)) - self.assertTrue(np.array_equal(x_grad, z * 4)) - self.assertTrue(np.array_equal(y_grad, z * 4)) + np.testing.assert_array_equal(w_grad, z) + np.testing.assert_array_equal(x_grad, z * 4) + np.testing.assert_array_equal(y_grad, z * 4) z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( device, hooks, remove3=True) - self.assertTrue(np.array_equal(w_grad, z)) - self.assertTrue(np.array_equal(x_grad, z * 4)) - self.assertTrue(np.array_equal(y_grad, z * 4)) + np.testing.assert_array_equal(w_grad, z) + np.testing.assert_array_equal(x_grad, z * 4) + np.testing.assert_array_equal(y_grad, z * 4) z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( device, hooks, remove1=True, remove2=True, remove3=True) - self.assertTrue(np.array_equal(w_grad, z)) - self.assertTrue(np.array_equal(x_grad, z)) - self.assertTrue(np.array_equal(y_grad, z)) + np.testing.assert_array_equal(w_grad, z) + np.testing.assert_array_equal(x_grad, z) + np.testing.assert_array_equal(y_grad, z) def test_multiple_hooks_for_interior_var(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) @@ -502,7 +499,7 @@ def double_print_hook(grad): pass else: z.backward() - self.assertTrue(np.array_equal(x.grad.numpy(), np.array([8.]))) + np.testing.assert_array_equal(x.grad.numpy(), np.array([8.0])) def test_hook_in_double_grad(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py index ded9d42b9b5fe..64202ad377aca 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py @@ -50,7 +50,7 @@ def check_operation(self, a, b, c, op): raise ValueError("Unsupported operation.") self.assertEqual(c_rlt.dtype, c.dtype) - self.assertTrue(np.array_equal(c_rlt.numpy(), c.numpy())) + np.testing.assert_array_equal(c_rlt.numpy(), c.numpy()) def func_tensor_add_scalar(self): # tensor(int64) + scalar(int) diff --git a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py index 701ff5c3d6e9f..a030efd44880a 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py @@ -58,7 +58,7 @@ def check_operation(self, a, b, c, op): rlt = exe.run(fetch_list=[c_rlt.name, c.name]) self.assertEqual(rlt[0].dtype, rlt[1].dtype) - self.assertTrue(np.array_equal(rlt[0], rlt[1])) + np.testing.assert_array_equal(rlt[0], rlt[1]) def test_tensor_add_scalar(self): # tensor(int64) + scalar(int) diff --git a/python/paddle/fluid/tests/unittests/test_tensor_uva.py b/python/paddle/fluid/tests/unittests/test_tensor_uva.py index 8e62d04004170..a2f0bfa6515b6 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_uva.py +++ 
b/python/paddle/fluid/tests/unittests/test_tensor_uva.py @@ -47,10 +47,15 @@ def func_uva_tensor_creation(self): data = np.random.randint(10, size=[4, 5]).astype(dtype) if _in_legacy_dygraph(): tensor = paddle.fluid.core.to_uva_tensor(data, 0) + tensor2 = paddle.fluid.core.to_uva_tensor(data) else: tensor = core.eager.to_uva_tensor(data, 0) + tensor2 = core.eager.to_uva_tensor(data) + self.assertTrue(tensor.place.is_gpu_place()) + self.assertTrue(tensor2.place.is_gpu_place()) self.assertTrue(np.allclose(tensor.numpy(), data)) + self.assertTrue(np.allclose(tensor2.numpy(), data)) def test_uva_tensor_creation(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_translated_layer.py b/python/paddle/fluid/tests/unittests/test_translated_layer.py index ba44c78f2c74d..f63e8d3eac0fd 100644 --- a/python/paddle/fluid/tests/unittests/test_translated_layer.py +++ b/python/paddle/fluid/tests/unittests/test_translated_layer.py @@ -132,7 +132,7 @@ def load_and_inference(self): translated_layer.eval() pred = translated_layer(x) - self.assertTrue(np.array_equal(orig_pred.numpy(), pred.numpy())) + np.testing.assert_array_equal(orig_pred.numpy(), pred.numpy()) def load_and_fine_tuning(self): # load @@ -148,9 +148,11 @@ def load_and_fine_tuning(self): parameters=translated_layer.parameters()) loss = train(translated_layer, self.loader, self.loss_fn, sgd) - self.assertTrue(np.array_equal(orig_loss.numpy(), loss.numpy()), - msg="original loss:\n{}\nnew loss:\n{}\n".format( - orig_loss.numpy(), loss.numpy())) + np.testing.assert_array_equal( + orig_loss.numpy(), + loss.numpy(), + err_msg='original loss:\n{}\nnew loss:\n{}\n'.format( + orig_loss.numpy(), loss.numpy())) def test_get_program(self): # load diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index fb48f63185075..7f1794c39fcad 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -458,13 +458,13 @@ def test_moveaxis1(self): exe = paddle.static.Executor() out_np = exe.run(feed={"x": x_np}, fetch_list=[out])[0] - self.assertEqual(np.array_equal(out_np, expected), True) + np.testing.assert_array_equal(out_np, expected) paddle.disable_static() x = paddle.to_tensor(x_np) out = paddle.moveaxis(x, [0, 4, 3, 2], [1, 3, 2, 0]) self.assertEqual(out.shape, [4, 2, 5, 7, 3]) - self.assertEqual(np.array_equal(out.numpy(), expected), True) + np.testing.assert_array_equal(out.numpy(), expected) paddle.enable_static() def test_moveaxis2(self): @@ -478,13 +478,13 @@ def test_moveaxis2(self): exe = paddle.static.Executor() out_np = exe.run(feed={"x": x_np}, fetch_list=[out])[0] - self.assertEqual(np.array_equal(out_np, expected), True) + np.testing.assert_array_equal(out_np, expected) paddle.disable_static() x = paddle.to_tensor(x_np) out = x.moveaxis(-2, -1) self.assertEqual(out.shape, [2, 5, 3]) - self.assertEqual(np.array_equal(out.numpy(), expected), True) + np.testing.assert_array_equal(out.numpy(), expected) paddle.enable_static() def test_moveaxis3(self): diff --git a/python/paddle/fluid/tests/unittests/test_unbind_op.py b/python/paddle/fluid/tests/unittests/test_unbind_op.py index 5f8fb382eb935..58b943a2668b5 100644 --- a/python/paddle/fluid/tests/unittests/test_unbind_op.py +++ b/python/paddle/fluid/tests/unittests/test_unbind_op.py @@ -1,230 +1,230 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import numpy as np -from op_test import OpTest, convert_float_to_uint16 -import paddle -import paddle.fluid as fluid -import paddle.tensor as tensor -from paddle.fluid import compiler, Program, program_guard, core -from paddle.fluid.framework import _test_eager_guard - - -class TestUnbind(unittest.TestCase): - - def test_unbind(self): - - x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1') - [out_0, out_1] = tensor.unbind(input=x_1, axis=0) - input_1 = np.random.random([2, 3]).astype("float32") - axis = fluid.data(shape=[1], dtype='int32', name='axis') - exe = fluid.Executor(place=fluid.CPUPlace()) - - [res_1, res_2] = exe.run(fluid.default_main_program(), - feed={ - "x_1": input_1, - "axis": 0 - }, - fetch_list=[out_0, out_1]) - - assert np.array_equal(res_1, input_1[0, 0:100]) - assert np.array_equal(res_2, input_1[1, 0:100]) - - def test_unbind_dygraph(self): - with fluid.dygraph.guard(): - np_x = np.random.random([2, 3]).astype("float32") - x = paddle.to_tensor(np_x) - x.stop_gradient = False - [res_1, res_2] = paddle.unbind(x, 0) - self.assertTrue(np.array_equal(res_1, np_x[0, 0:100])) - self.assertTrue(np.array_equal(res_2, np_x[1, 0:100])) - - out = paddle.add_n([res_1, res_2]) - - np_grad = np.ones(x.shape, np.float32) - out.backward() - self.assertTrue(np.array_equal(x.grad.numpy(), np_grad)) - - def test_unbind_dygraph_final_state(self): - with _test_eager_guard(): - self.test_unbind_dygraph() - - -class TestLayersUnbind(unittest.TestCase): - - def test_layers_unbind(self): - - x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1') - [out_0, out_1] = fluid.layers.unbind(input=x_1, axis=0) - input_1 = np.random.random([2, 3]).astype("float32") - axis = fluid.data(shape=[1], dtype='int32', name='axis') - exe = fluid.Executor(place=fluid.CPUPlace()) - - [res_1, res_2] = exe.run(fluid.default_main_program(), - feed={ - "x_1": input_1, - "axis": 0 - }, - fetch_list=[out_0, out_1]) - - assert np.array_equal(res_1, input_1[0, 0:100]) - assert np.array_equal(res_2, input_1[1, 0:100]) - - -class TestUnbindOp(OpTest): - - def initParameters(self): - pass - - def outReshape(self): - pass - - def setAxis(self): - pass - - def setUp(self): - self._set_op_type() - self.dtype = self.get_dtype() - self.axis = 0 - self.num = 3 - self.initParameters() - x = np.arange(12).reshape(3, 2, 2).astype(self.dtype) - self.out = np.split(x, self.num, self.axis) - self.outReshape() - self.inputs = {'X': x} - self.attrs = {'axis': self.axis} - self.setAxis() - self.outputs = {'Out': [('out%d' % i, self.out[i]) \ - for i in range(len(self.out))]} - - def get_dtype(self): - return "float64" - - def _set_op_type(self): - self.op_type = "unbind" - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], ['out0', 'out1', 'out2']) - - -class TestUnbindOp1(TestUnbindOp): - - def initParameters(self): - self.axis = 1 - self.num = 2 - - def 
test_check_grad(self): - self.check_grad(['X'], ['out0', 'out1']) - - def outReshape(self): - self.out[0] = self.out[0].reshape((3, 2)) - self.out[1] = self.out[1].reshape((3, 2)) - - -class TestUnbindOp2(TestUnbindOp): - - def initParameters(self): - self.axis = 2 - self.num = 2 - - def test_check_grad(self): - self.check_grad(['X'], ['out0', 'out1']) - - def outReshape(self): - self.out[0] = self.out[0].reshape((3, 2)) - self.out[1] = self.out[1].reshape((3, 2)) - - -class TestUnbindOp3(TestUnbindOp): - - def initParameters(self): - self.axis = 2 - self.num = 2 - - def setAxis(self): - self.attrs = {'axis': -1} - - def test_check_grad(self): - self.check_grad(['X'], ['out0', 'out1']) - - def outReshape(self): - self.out[0] = self.out[0].reshape((3, 2)) - self.out[1] = self.out[1].reshape((3, 2)) - - -class TestUnbindOp4(TestUnbindOp): - - def initParameters(self): - self.axis = 1 - self.num = 2 - - def setAxis(self): - self.attrs = {'axis': -2} - - def test_check_grad(self): - self.check_grad(['X'], ['out0', 'out1']) - - def outReshape(self): - self.out[0] = self.out[0].reshape((3, 2)) - self.out[1] = self.out[1].reshape((3, 2)) - - -class TestUnbindBF16Op(OpTest): - - def setUp(self): - self._set_op_type() - self.python_api = paddle.unbind - self.dtype = self.get_dtype() - self.axis = 0 - self.num = 3 - x = np.arange(12).reshape(3, 2, 2).astype(self.dtype) - self.out = np.split(x, self.num, self.axis) - self.inputs = {'X': convert_float_to_uint16(x)} - self.attrs = {'axis': self.axis} - self.outputs = {'Out': [('out%d' % i, convert_float_to_uint16(self.out[i])) \ - for i in range(len(self.out))]} - - def get_dtype(self): - return np.uint16 - - def _set_op_type(self): - self.op_type = "unbind" - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - pass - - -class TestUnbindAxisError(unittest.TestCase): - - def test_errors(self): - with program_guard(Program(), Program()): - x = fluid.data(shape=[2, 3], dtype='float32', name='x') - - def test_table_Variable(): - tensor.unbind(input=x, axis=2.0) - - self.assertRaises(TypeError, test_table_Variable) - - -if __name__ == '__main__': - unittest.main() +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.fluid as fluid +import paddle.tensor as tensor +from paddle.fluid import compiler, Program, program_guard, core +from paddle.fluid.framework import _test_eager_guard + + +class TestUnbind(unittest.TestCase): + + def test_unbind(self): + + x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1') + [out_0, out_1] = tensor.unbind(input=x_1, axis=0) + input_1 = np.random.random([2, 3]).astype("float32") + axis = fluid.data(shape=[1], dtype='int32', name='axis') + exe = fluid.Executor(place=fluid.CPUPlace()) + + [res_1, res_2] = exe.run(fluid.default_main_program(), + feed={ + "x_1": input_1, + "axis": 0 + }, + fetch_list=[out_0, out_1]) + + assert np.array_equal(res_1, input_1[0, 0:100]) + assert np.array_equal(res_2, input_1[1, 0:100]) + + def test_unbind_dygraph(self): + with fluid.dygraph.guard(): + np_x = np.random.random([2, 3]).astype("float32") + x = paddle.to_tensor(np_x) + x.stop_gradient = False + [res_1, res_2] = paddle.unbind(x, 0) + np.testing.assert_array_equal(res_1, np_x[0, 0:100]) + np.testing.assert_array_equal(res_2, np_x[1, 0:100]) + + out = paddle.add_n([res_1, res_2]) + + np_grad = np.ones(x.shape, np.float32) + out.backward() + np.testing.assert_array_equal(x.grad.numpy(), np_grad) + + def test_unbind_dygraph_final_state(self): + with _test_eager_guard(): + self.test_unbind_dygraph() + + +class TestLayersUnbind(unittest.TestCase): + + def test_layers_unbind(self): + + x_1 = fluid.data(shape=[2, 3], dtype='float32', name='x_1') + [out_0, out_1] = fluid.layers.unbind(input=x_1, axis=0) + input_1 = np.random.random([2, 3]).astype("float32") + axis = fluid.data(shape=[1], dtype='int32', name='axis') + exe = fluid.Executor(place=fluid.CPUPlace()) + + [res_1, res_2] = exe.run(fluid.default_main_program(), + feed={ + "x_1": input_1, + "axis": 0 + }, + fetch_list=[out_0, out_1]) + + assert np.array_equal(res_1, input_1[0, 0:100]) + assert np.array_equal(res_2, input_1[1, 0:100]) + + +class TestUnbindOp(OpTest): + + def initParameters(self): + pass + + def outReshape(self): + pass + + def setAxis(self): + pass + + def setUp(self): + self._set_op_type() + self.dtype = self.get_dtype() + self.axis = 0 + self.num = 3 + self.initParameters() + x = np.arange(12).reshape(3, 2, 2).astype(self.dtype) + self.out = np.split(x, self.num, self.axis) + self.outReshape() + self.inputs = {'X': x} + self.attrs = {'axis': self.axis} + self.setAxis() + self.outputs = {'Out': [('out%d' % i, self.out[i]) \ + for i in range(len(self.out))]} + + def get_dtype(self): + return "float64" + + def _set_op_type(self): + self.op_type = "unbind" + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], ['out0', 'out1', 'out2']) + + +class TestUnbindOp1(TestUnbindOp): + + def initParameters(self): + self.axis = 1 + self.num = 2 + + def test_check_grad(self): + self.check_grad(['X'], ['out0', 'out1']) + + def outReshape(self): + self.out[0] = self.out[0].reshape((3, 2)) + self.out[1] = self.out[1].reshape((3, 2)) + + +class TestUnbindOp2(TestUnbindOp): + + def initParameters(self): + self.axis = 2 + self.num = 2 + + def test_check_grad(self): + self.check_grad(['X'], ['out0', 'out1']) + + def outReshape(self): + self.out[0] = self.out[0].reshape((3, 2)) + self.out[1] = self.out[1].reshape((3, 2)) + + +class TestUnbindOp3(TestUnbindOp): + + def initParameters(self): + self.axis = 2 + self.num = 2 
+ + def setAxis(self): + self.attrs = {'axis': -1} + + def test_check_grad(self): + self.check_grad(['X'], ['out0', 'out1']) + + def outReshape(self): + self.out[0] = self.out[0].reshape((3, 2)) + self.out[1] = self.out[1].reshape((3, 2)) + + +class TestUnbindOp4(TestUnbindOp): + + def initParameters(self): + self.axis = 1 + self.num = 2 + + def setAxis(self): + self.attrs = {'axis': -2} + + def test_check_grad(self): + self.check_grad(['X'], ['out0', 'out1']) + + def outReshape(self): + self.out[0] = self.out[0].reshape((3, 2)) + self.out[1] = self.out[1].reshape((3, 2)) + + +class TestUnbindBF16Op(OpTest): + + def setUp(self): + self._set_op_type() + self.python_api = paddle.unbind + self.dtype = self.get_dtype() + self.axis = 0 + self.num = 3 + x = np.arange(12).reshape(3, 2, 2).astype(self.dtype) + self.out = np.split(x, self.num, self.axis) + self.inputs = {'X': convert_float_to_uint16(x)} + self.attrs = {'axis': self.axis} + self.outputs = {'Out': [('out%d' % i, convert_float_to_uint16(self.out[i])) \ + for i in range(len(self.out))]} + + def get_dtype(self): + return np.uint16 + + def _set_op_type(self): + self.op_type = "unbind" + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + pass + + +class TestUnbindAxisError(unittest.TestCase): + + def test_errors(self): + with program_guard(Program(), Program()): + x = fluid.data(shape=[2, 3], dtype='float32', name='x') + + def test_table_Variable(): + tensor.unbind(input=x, axis=2.0) + + self.assertRaises(TypeError, test_table_Variable) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_unpool3d_op.py b/python/paddle/fluid/tests/unittests/test_unpool3d_op.py index 1fbff100a3db5..ec04ca56a5441 100644 --- a/python/paddle/fluid/tests/unittests/test_unpool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_unpool3d_op.py @@ -63,10 +63,30 @@ def unpool3dmax_forward_naive(input, indices, ksize, strides, paddings, return out +def max_unpool3d_wrapper(x, + indices, + kernel_size, + stride=None, + padding=0, + output_size=None, + data_format="NCDHW", + name=None): + out = paddle.nn.functional.max_unpool3d(x, + indices, + kernel_size, + stride=stride, + padding=padding, + data_format=data_format, + output_size=output_size, + name=name) + return out + + class TestUnpool3DOp(OpTest): def setUp(self): self.op_type = "unpool3d" + self.python_api = max_unpool3d_wrapper self.init_test_case() inputs = np.random.randint(0, 100, self.shape) nsize, csize, dsize, hsize, wsize = inputs.shape @@ -102,10 +122,10 @@ def setUp(self): self.outputs = {'Out': output.astype('float64')} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) def init_test_case(self): self.unpool3d_forward_naive = unpool3dmax_forward_naive diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py index 1b6d3d9dfb732..07e44ce65d613 100644 --- a/python/paddle/fluid/tests/unittests/test_unpool_op.py +++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle def _unpool_output_size(x, kernel_size, stride, padding, output_size): @@ -53,10 +54,30 @@ def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings, return out +def max_unpool2d_wrapper(x, + indices, + kernel_size, + stride=None, + 
padding=0, + output_size=None, + data_format="NCHW", + name=None): + out = paddle.nn.functional.max_unpool2d(x, + indices, + kernel_size, + stride=stride, + padding=padding, + data_format=data_format, + output_size=output_size, + name=name) + return out + + class TestUnpoolOp(OpTest): def setUp(self): self.op_type = "unpool" + self.python_api = max_unpool2d_wrapper self.init_test_case() input = np.random.randint(0, 100, self.shape) nsize, csize, hsize, wsize = input.shape @@ -91,10 +112,10 @@ def setUp(self): self.outputs = {'Out': output.astype('float64')} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) def init_test_case(self): self.unpool2d_forward_naive = unpool2dmax_forward_naive @@ -127,7 +148,7 @@ def init_test_case(self): self.ksize = [4, 4] self.strides = [2, 2] self.paddings = [0, 0] - self.output_size = [9, 9] + self.output_size = [12, 12] class TestUnpoolOpOutput(TestUnpoolOp): @@ -139,7 +160,7 @@ def init_test_case(self): self.ksize = [4, 4] self.strides = [2, 2] self.paddings = [0, 0] - self.output_size = [9, 9] + self.output_size = [12, 12] class TestUnpoolOpException(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py index fb250bc64b24d..fa0234227d495 100755 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py @@ -187,7 +187,7 @@ def test_out(self): "data2": input2 }, fetch_list=[result_squeeze]) - self.assertTrue(np.array_equal(input1, result1)) + np.testing.assert_array_equal(input1, result1) self.assertEqual(input1.shape, result1.shape) @@ -200,7 +200,7 @@ def test_out(self): input = paddle.to_tensor(input_1) output = paddle.unsqueeze(input, axis=[1]) out_np = output.numpy() - self.assertTrue(np.array_equal(input1, out_np)) + np.testing.assert_array_equal(input1, out_np) self.assertEqual(input1.shape, out_np.shape) @@ -213,7 +213,7 @@ def test_out(self): input = paddle.to_tensor(input1) output = paddle.unsqueeze(input, axis=1) out_np = output.numpy() - self.assertTrue(np.array_equal(out1, out_np)) + np.testing.assert_array_equal(out1, out_np) self.assertEqual(out1.shape, out_np.shape) @@ -227,7 +227,7 @@ def test_out(self): input = paddle.to_tensor(input1) output = paddle.unsqueeze(input, axis=paddle.to_tensor([1, 2])) out_np = output.numpy() - self.assertTrue(np.array_equal(out1, out_np)) + np.testing.assert_array_equal(out1, out_np) self.assertEqual(out1.shape, out_np.shape) @@ -245,7 +245,7 @@ def test_out(self): axis=[paddle.to_tensor([1]), paddle.to_tensor([2])]) out_np = output.numpy() - self.assertTrue(np.array_equal(out1, out_np)) + np.testing.assert_array_equal(out1, out_np) self.assertEqual(out1.shape, out_np.shape) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index c16238486df94..e670781bee794 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -39,7 +39,7 @@ def check_with_place(place): paddle.set_default_dtype('float32') # set_default_dtype should not take effect on int x = paddle.to_tensor(1, place=place, stop_gradient=False) - self.assertTrue(np.array_equal(x.numpy(), [1])) + np.testing.assert_array_equal(x.numpy(), [1]) self.assertNotEqual(x.dtype, core.VarDesc.VarType.FP32) y = 
paddle.to_tensor(2, place=x.place) @@ -49,8 +49,8 @@ def check_with_place(place): x = paddle.to_tensor(np.array([1.2]).astype('float16'), place=place, stop_gradient=False) - self.assertTrue( - np.array_equal(x.numpy(), np.array([1.2], 'float16'))) + np.testing.assert_array_equal(x.numpy(), + np.array([1.2], 'float16')) self.assertEqual(x.dtype, core.VarDesc.VarType.FP16) # set_default_dtype take effect on int @@ -59,20 +59,17 @@ def check_with_place(place): # set_default_dtype take effect on float x = paddle.to_tensor(1.2, place=place, stop_gradient=False) - self.assertTrue( - np.array_equal(x.numpy(), - np.array([1.2]).astype('float32'))) + np.testing.assert_array_equal(x.numpy(), + np.array([1.2]).astype('float32')) self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) clone_x = x.clone() - self.assertTrue( - np.array_equal(clone_x.numpy(), - np.array([1.2]).astype('float32'))) + np.testing.assert_array_equal(clone_x.numpy(), + np.array([1.2]).astype('float32')) self.assertEqual(clone_x.dtype, core.VarDesc.VarType.FP32) y = clone_x**2 y.backward() - self.assertTrue( - np.array_equal(x.grad.numpy(), - np.array([2.4]).astype('float32'))) + np.testing.assert_array_equal(x.grad.numpy(), + np.array([2.4]).astype('float32')) y = x.cpu() self.assertEqual(y.place.__repr__(), "Place(cpu)") if core.is_compiled_with_cuda(): @@ -98,23 +95,23 @@ def check_with_place(place): # set_default_dtype take effect on complex x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) - self.assertTrue(np.array_equal(x.numpy(), [1 + 2j])) + np.testing.assert_array_equal(x.numpy(), [1 + 2j]) self.assertEqual(x.dtype, core.VarDesc.VarType.COMPLEX64) paddle.set_default_dtype('float64') x = paddle.to_tensor(1.2, place=place, stop_gradient=False) - self.assertTrue(np.array_equal(x.numpy(), [1.2])) + np.testing.assert_array_equal(x.numpy(), [1.2]) self.assertEqual(x.dtype, core.VarDesc.VarType.FP64) x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) - self.assertTrue(np.array_equal(x.numpy(), [1 + 2j])) + np.testing.assert_array_equal(x.numpy(), [1 + 2j]) self.assertEqual(x.dtype, core.VarDesc.VarType.COMPLEX128) x = paddle.to_tensor(1, dtype='float32', place=place, stop_gradient=False) - self.assertTrue(np.array_equal(x.numpy(), [1.])) + np.testing.assert_array_equal(x.numpy(), [1.0]) self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) self.assertEqual(x.shape, [1]) self.assertEqual(x.stop_gradient, False) @@ -128,7 +125,7 @@ def check_with_place(place): dtype='float32', place=place, stop_gradient=False) - self.assertTrue(np.array_equal(x.numpy(), [1., 2.])) + np.testing.assert_array_equal(x.numpy(), [1.0, 2.0]) self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) self.assertEqual(x.grad, None) self.assertEqual(x.shape, [2]) @@ -139,7 +136,7 @@ def check_with_place(place): dtype='float32', place=place, stop_gradient=False) - self.assertTrue(np.array_equal(x.numpy(), self.array)) + np.testing.assert_array_equal(x.numpy(), self.array) self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) self.assertEqual(x.shape, self.shape) self.assertEqual(x.stop_gradient, False) @@ -147,19 +144,19 @@ def check_with_place(place): y = paddle.to_tensor(x) y = paddle.to_tensor(y, dtype='float64', place=place) - self.assertTrue(np.array_equal(y.numpy(), self.array)) + np.testing.assert_array_equal(y.numpy(), self.array) self.assertEqual(y.dtype, core.VarDesc.VarType.FP64) self.assertEqual(y.shape, self.shape) self.assertEqual(y.stop_gradient, True) self.assertEqual(y.type, core.VarDesc.VarType.LOD_TENSOR) z = x + y - 
self.assertTrue(np.array_equal(z.numpy(), 2 * self.array)) + np.testing.assert_array_equal(z.numpy(), 2 * self.array) x = paddle.to_tensor([1 + 2j, 1 - 2j], dtype='complex64', place=place) y = paddle.to_tensor(x) - self.assertTrue(np.array_equal(x.numpy(), [1 + 2j, 1 - 2j])) + np.testing.assert_array_equal(x.numpy(), [1 + 2j, 1 - 2j]) self.assertEqual(y.dtype, core.VarDesc.VarType.COMPLEX64) self.assertEqual(y.shape, [2]) @@ -168,7 +165,7 @@ def check_with_place(place): x_array = np.array(x) self.assertEqual(x_array.shape, x.numpy().shape) self.assertEqual(x_array.dtype, x.numpy().dtype) - self.assertTrue(np.array_equal(x_array, x.numpy())) + np.testing.assert_array_equal(x_array, x.numpy()) x = paddle.to_tensor(1.0) self.assertEqual(x.item(), 1.0) @@ -178,9 +175,8 @@ def check_with_place(place): self.assertTrue(isinstance(x.item(5), float)) self.assertTrue(isinstance(x.item(1, 0, 1), float)) self.assertEqual(x.item(5), x.item(1, 0, 1)) - self.assertTrue( - np.array_equal(x.item(1, 0, 1), - x.numpy().item(1, 0, 1))) + np.testing.assert_array_equal(x.item(1, 0, 1), + x.numpy().item(1, 0, 1)) x = paddle.to_tensor([[1.111111, 2.222222, 3.333333]]) self.assertEqual(x.item(0, 2), x.item(2)) @@ -228,7 +224,7 @@ def check_with_place(place): self.assertEqual(x.shape, [0]) expected_result = np.array([], dtype='float32') self.assertEqual(x.numpy().shape, expected_result.shape) - self.assertTrue(np.array_equal(x.numpy(), expected_result)) + np.testing.assert_array_equal(x.numpy(), expected_result) numpy_array = np.random.randn(3, 4) # covert core.LoDTensor to paddle.Tensor @@ -236,7 +232,7 @@ def check_with_place(place): place = paddle.fluid.framework._current_expected_place() lod_tensor.set(numpy_array, place) x = paddle.to_tensor(lod_tensor) - self.assertTrue(np.array_equal(x.numpy(), numpy_array)) + np.testing.assert_array_equal(x.numpy(), numpy_array) self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR) self.assertEqual(str(x.place), str(place)) @@ -245,7 +241,7 @@ def check_with_place(place): dlpack = x.value().get_tensor()._to_dlpack() tensor_from_dlpack = paddle.fluid.core.from_dlpack(dlpack) x = paddle.to_tensor(tensor_from_dlpack) - self.assertTrue(np.array_equal(x.numpy(), numpy_array)) + np.testing.assert_array_equal(x.numpy(), numpy_array) self.assertEqual(x.type, core.VarDesc.VarType.LOD_TENSOR) with self.assertRaises(ValueError): @@ -326,13 +322,13 @@ def func_test_to_tensor_with_lodtensor(self): lod_tensor = core.LoDTensor() lod_tensor.set(a_np, core.CPUPlace()) a = paddle.to_tensor(lod_tensor) - self.assertTrue(np.array_equal(a_np, a.numpy())) + np.testing.assert_array_equal(a_np, a.numpy()) with paddle.fluid.dygraph.guard(core.CUDAPlace(0)): lod_tensor = core.LoDTensor() lod_tensor.set(a_np, core.CUDAPlace(0)) a = paddle.to_tensor(lod_tensor, place=core.CPUPlace()) - self.assertTrue(np.array_equal(a_np, a.numpy())) + np.testing.assert_array_equal(a_np, a.numpy()) self.assertTrue(a.place.__repr__(), "Place(cpu)") def test_to_tensor_with_lodtensor(self): @@ -343,7 +339,7 @@ def test_to_tensor_with_lodtensor(self): def func_test_to_variable(self): with fluid.dygraph.guard(): var = fluid.dygraph.to_variable(self.array, name="abc") - self.assertTrue(np.array_equal(var.numpy(), self.array)) + np.testing.assert_array_equal(var.numpy(), self.array) self.assertEqual(var.name, 'abc') # default value self.assertEqual(var.persistable, False) @@ -368,7 +364,7 @@ def func_test_list_to_variable(self): with fluid.dygraph.guard(): array = [[[1, 2], [1, 2], [1.0, 2]], [[1, 2], [1, 2], [1, 2]]] 
var = fluid.dygraph.to_variable(array, dtype='int32') - self.assertTrue(np.array_equal(var.numpy(), array)) + np.testing.assert_array_equal(var.numpy(), array) self.assertEqual(var.shape, [2, 3, 2]) self.assertEqual(var.dtype, core.VarDesc.VarType.INT32) self.assertEqual(var.type, core.VarDesc.VarType.LOD_TENSOR) @@ -382,7 +378,7 @@ def func_test_tuple_to_variable(self): with fluid.dygraph.guard(): array = (((1, 2), (1, 2), (1, 2)), ((1, 2), (1, 2), (1, 2))) var = fluid.dygraph.to_variable(array, dtype='float32') - self.assertTrue(np.array_equal(var.numpy(), array)) + np.testing.assert_array_equal(var.numpy(), array) self.assertEqual(var.shape, [2, 3, 2]) self.assertEqual(var.dtype, core.VarDesc.VarType.FP32) self.assertEqual(var.type, core.VarDesc.VarType.LOD_TENSOR) @@ -397,7 +393,7 @@ def func_test_tensor_to_variable(self): t = fluid.Tensor() t.set(np.random.random((1024, 1024)), fluid.CPUPlace()) var = fluid.dygraph.to_variable(t) - self.assertTrue(np.array_equal(t, var.numpy())) + np.testing.assert_array_equal(t, var.numpy()) def test_tensor_to_variable(self): with _test_eager_guard(): @@ -516,11 +512,11 @@ def func_test_deep_copy(self): self.assertEqual(x_copy.persistable, y_copy.persistable) self.assertEqual(x_copy.type, y_copy.type) self.assertEqual(x_copy.dtype, y_copy.dtype) - self.assertTrue(np.array_equal(x.numpy(), x_copy.numpy())) - self.assertTrue(np.array_equal(y.numpy(), y_copy.numpy())) + np.testing.assert_array_equal(x.numpy(), x_copy.numpy()) + np.testing.assert_array_equal(y.numpy(), y_copy.numpy()) self.assertNotEqual(id(x), id(x_copy)) - self.assertTrue(np.array_equal(x.numpy(), [2.])) + np.testing.assert_array_equal(x.numpy(), [2.0]) with self.assertRaises(ValueError): x_copy[:] = 5. @@ -559,9 +555,9 @@ def func_test_deep_copy(self): self.assertEqual(copy_selected_rows.height(), selected_rows.height()) self.assertEqual(copy_selected_rows.rows(), selected_rows.rows()) - self.assertTrue( - np.array_equal(np.array(copy_selected_rows.get_tensor()), - np.array(selected_rows.get_tensor()))) + np.testing.assert_array_equal( + np.array(copy_selected_rows.get_tensor()), + np.array(selected_rows.get_tensor())) def test_deep_copy(self): with _test_eager_guard(): @@ -577,7 +573,7 @@ def func_test_set_value(self): tmp2 = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) var.set_value(tmp2) - self.assertTrue(np.array_equal(var.numpy(), tmp2)) + np.testing.assert_array_equal(var.numpy(), tmp2) def test_set_value(self): with _test_eager_guard(): @@ -729,33 +725,32 @@ def _test_slice(self): ] local_out = [var.numpy() for var in vars] - self.assertTrue(np.array_equal(local_out[1], tensor_array[0, 1, 1:2])) - self.assertTrue(np.array_equal(local_out[2], tensor_array[1:])) - self.assertTrue(np.array_equal(local_out[3], tensor_array[0:1])) - self.assertTrue(np.array_equal(local_out[4], tensor_array[::-1])) - self.assertTrue(np.array_equal(local_out[5], tensor_array[1, 1:, 1:])) - self.assertTrue( - np.array_equal(local_out[6], - tensor_array.reshape((3, -1, 3))[:, :, -1])) - self.assertTrue(np.array_equal(local_out[7], tensor_array[:, :, :-1])) - self.assertTrue(np.array_equal(local_out[8], tensor_array[:1, :1, :1])) - self.assertTrue( - np.array_equal(local_out[9], tensor_array[:-1, :-1, :-1])) - self.assertTrue( - np.array_equal(local_out[10], tensor_array[::-1, :1, :-1])) - self.assertTrue( - np.array_equal(local_out[11], tensor_array[:-1, ::-1, -1:])) - self.assertTrue( - np.array_equal(local_out[12], tensor_array[1:2, 2:, ::-1])) - self.assertTrue( - 
np.array_equal(local_out[13], tensor_array[2:10, 2:, -2:-1])) - self.assertTrue( - np.array_equal(local_out[14], tensor_array[1:-1, 0:2, ::-1])) - self.assertTrue( - np.array_equal(local_out[15], tensor_array[::-1, ::-1, ::-1])) - self.assertTrue(np.array_equal(local_out[16], tensor_array[-4:4])) - self.assertTrue(np.array_equal(local_out[17], tensor_array[:, 0, 0:0])) - self.assertTrue(np.array_equal(local_out[18], tensor_array[:, 1:1:2])) + np.testing.assert_array_equal(local_out[1], tensor_array[0, 1, 1:2]) + np.testing.assert_array_equal(local_out[2], tensor_array[1:]) + np.testing.assert_array_equal(local_out[3], tensor_array[0:1]) + np.testing.assert_array_equal(local_out[4], tensor_array[::-1]) + np.testing.assert_array_equal(local_out[5], tensor_array[1, 1:, 1:]) + np.testing.assert_array_equal( + local_out[6], + tensor_array.reshape((3, -1, 3))[:, :, -1]) + np.testing.assert_array_equal(local_out[7], tensor_array[:, :, :-1]) + np.testing.assert_array_equal(local_out[8], tensor_array[:1, :1, :1]) + np.testing.assert_array_equal(local_out[9], tensor_array[:-1, :-1, :-1]) + np.testing.assert_array_equal(local_out[10], + tensor_array[::-1, :1, :-1]) + np.testing.assert_array_equal(local_out[11], tensor_array[:-1, ::-1, + -1:]) + np.testing.assert_array_equal(local_out[12], tensor_array[1:2, + 2:, ::-1]) + np.testing.assert_array_equal(local_out[13], tensor_array[2:10, 2:, + -2:-1]) + np.testing.assert_array_equal(local_out[14], tensor_array[1:-1, + 0:2, ::-1]) + np.testing.assert_array_equal(local_out[15], + tensor_array[::-1, ::-1, ::-1]) + np.testing.assert_array_equal(local_out[16], tensor_array[-4:4]) + np.testing.assert_array_equal(local_out[17], tensor_array[:, 0, 0:0]) + np.testing.assert_array_equal(local_out[18], tensor_array[:, 1:1:2]) def _test_slice_for_tensor_attr(self): tensor_array = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]], @@ -795,31 +790,30 @@ def _test_slice_for_tensor_attr(self): ] local_out = [var.numpy() for var in vars] - self.assertTrue(np.array_equal(local_out[1], tensor_array[0, 1, 1:2])) - self.assertTrue(np.array_equal(local_out[2], tensor_array[1:])) - self.assertTrue(np.array_equal(local_out[3], tensor_array[0:1])) - self.assertTrue(np.array_equal(local_out[4], tensor_array[::-1])) - self.assertTrue(np.array_equal(local_out[5], tensor_array[1, 1:, 1:])) - self.assertTrue( - np.array_equal(local_out[6], - tensor_array.reshape((3, -1, 3))[:, :, -1])) - self.assertTrue(np.array_equal(local_out[7], tensor_array[:, :, :-1])) - self.assertTrue(np.array_equal(local_out[8], tensor_array[:1, :1, :1])) - self.assertTrue( - np.array_equal(local_out[9], tensor_array[:-1, :-1, :-1])) - self.assertTrue( - np.array_equal(local_out[10], tensor_array[::-1, :1, :-1])) - self.assertTrue( - np.array_equal(local_out[11], tensor_array[:-1, ::-1, -1:])) - self.assertTrue( - np.array_equal(local_out[12], tensor_array[1:2, 2:, ::-1])) - self.assertTrue( - np.array_equal(local_out[13], tensor_array[2:10, 2:, -2:-1])) - self.assertTrue( - np.array_equal(local_out[14], tensor_array[1:-1, 0:2, ::-1])) - self.assertTrue( - np.array_equal(local_out[15], tensor_array[::-1, ::-1, ::-1])) - self.assertTrue(np.array_equal(local_out[16], tensor_array[-4:4])) + np.testing.assert_array_equal(local_out[1], tensor_array[0, 1, 1:2]) + np.testing.assert_array_equal(local_out[2], tensor_array[1:]) + np.testing.assert_array_equal(local_out[3], tensor_array[0:1]) + np.testing.assert_array_equal(local_out[4], tensor_array[::-1]) + np.testing.assert_array_equal(local_out[5], tensor_array[1, 1:, 
1:]) + np.testing.assert_array_equal( + local_out[6], + tensor_array.reshape((3, -1, 3))[:, :, -1]) + np.testing.assert_array_equal(local_out[7], tensor_array[:, :, :-1]) + np.testing.assert_array_equal(local_out[8], tensor_array[:1, :1, :1]) + np.testing.assert_array_equal(local_out[9], tensor_array[:-1, :-1, :-1]) + np.testing.assert_array_equal(local_out[10], + tensor_array[::-1, :1, :-1]) + np.testing.assert_array_equal(local_out[11], tensor_array[:-1, ::-1, + -1:]) + np.testing.assert_array_equal(local_out[12], tensor_array[1:2, + 2:, ::-1]) + np.testing.assert_array_equal(local_out[13], tensor_array[2:10, 2:, + -2:-1]) + np.testing.assert_array_equal(local_out[14], tensor_array[1:-1, + 0:2, ::-1]) + np.testing.assert_array_equal(local_out[15], + tensor_array[::-1, ::-1, ::-1]) + np.testing.assert_array_equal(local_out[16], tensor_array[-4:4]) def _test_for_getitem_ellipsis_index(self): shape = (64, 3, 5, 256) @@ -842,15 +836,15 @@ def assert_getitem_ellipsis_index(var_tensor, var_np): var_tensor[:, ..., 100].numpy(), ] - self.assertTrue(np.array_equal(var[0], var_np[..., 0])) - self.assertTrue(np.array_equal(var[1], var_np[..., 1, 0])) - self.assertTrue(np.array_equal(var[2], var_np[0, ..., 1, 0])) - self.assertTrue(np.array_equal(var[3], var_np[1, ..., 1])) - self.assertTrue(np.array_equal(var[4], var_np[2, ...])) - self.assertTrue(np.array_equal(var[5], var_np[2, 0, ...])) - self.assertTrue(np.array_equal(var[6], var_np[2, 0, 1, ...])) - self.assertTrue(np.array_equal(var[7], var_np[...])) - self.assertTrue(np.array_equal(var[8], var_np[:, ..., 100])) + np.testing.assert_array_equal(var[0], var_np[..., 0]) + np.testing.assert_array_equal(var[1], var_np[..., 1, 0]) + np.testing.assert_array_equal(var[2], var_np[0, ..., 1, 0]) + np.testing.assert_array_equal(var[3], var_np[1, ..., 1]) + np.testing.assert_array_equal(var[4], var_np[2, ...]) + np.testing.assert_array_equal(var[5], var_np[2, 0, ...]) + np.testing.assert_array_equal(var[6], var_np[2, 0, 1, ...]) + np.testing.assert_array_equal(var[7], var_np[...]) + np.testing.assert_array_equal(var[8], var_np[:, ..., 100]) var_fp32 = paddle.to_tensor(np_fp32_value) var_int = paddle.to_tensor(np_int_value) @@ -860,8 +854,8 @@ def assert_getitem_ellipsis_index(var_tensor, var_np): # test 1 dim tensor var_one_dim = paddle.to_tensor([1, 2, 3, 4]) - self.assertTrue( - np.array_equal(var_one_dim[..., 0].numpy(), np.array([1]))) + np.testing.assert_array_equal(var_one_dim[..., 0].numpy(), + np.array([1])) def _test_none_index(self): shape = (8, 64, 5, 256) @@ -883,19 +877,18 @@ def _test_none_index(self): var_tensor[0, 1:10:2, None, None, ...].numpy(), ] - self.assertTrue(np.array_equal(var[0], np_value[1, 0, None])) - self.assertTrue(np.array_equal(var[1], np_value[None, ..., 1, 0])) - self.assertTrue(np.array_equal(var[2], np_value[:, :, :, None])) - self.assertTrue(np.array_equal(var[3], np_value[1, ..., 1, None])) - self.assertTrue(np.array_equal(var[4], np_value[2, ..., None, None])) - self.assertTrue(np.array_equal(var[5], np_value[None, 2, 0, ...])) - self.assertTrue(np.array_equal(var[6], np_value[None, 2, None, 1])) - self.assertTrue(np.array_equal(var[7], np_value[None])) - self.assertTrue(np.array_equal(var[8], np_value[0, 0, None, 0, 0, - None])) - self.assertTrue( - np.array_equal(var[9], np_value[None, None, 0, ..., None])) - self.assertTrue(np.array_equal(var[10], np_value[..., None, :, None])) + np.testing.assert_array_equal(var[0], np_value[1, 0, None]) + np.testing.assert_array_equal(var[1], np_value[None, ..., 1, 0]) + 
np.testing.assert_array_equal(var[2], np_value[:, :, :, None]) + np.testing.assert_array_equal(var[3], np_value[1, ..., 1, None]) + np.testing.assert_array_equal(var[4], np_value[2, ..., None, None]) + np.testing.assert_array_equal(var[5], np_value[None, 2, 0, ...]) + np.testing.assert_array_equal(var[6], np_value[None, 2, None, 1]) + np.testing.assert_array_equal(var[7], np_value[None]) + np.testing.assert_array_equal(var[8], np_value[0, 0, None, 0, 0, None]) + np.testing.assert_array_equal(var[9], np_value[None, None, 0, ..., + None]) + np.testing.assert_array_equal(var[10], np_value[..., None, :, None]) # TODO(zyfncg) there is a bug of dimensions when slice step > 1 and # indexs has int type @@ -919,19 +912,17 @@ def _test_bool_index(self): var_tensor[tensor_index].numpy(), var_tensor[paddle.to_tensor(index[4])].numpy() ] - self.assertTrue(np.array_equal(var[0], np_value[index[0]])) - self.assertTrue(np.array_equal(var[1], np_value[index[1]])) - self.assertTrue(np.array_equal(var[2], np_value[index[2]])) - self.assertTrue(np.array_equal(var[3], np_value[index[3]])) - self.assertTrue(np.array_equal(var[4], np_value[index[0]])) - self.assertTrue(np.array_equal(var[5], np_value[index2d])) - self.assertTrue(np.array_equal(var[6], np_value[index[4]])) - self.assertTrue( - np.array_equal(var_tensor[var_tensor > 0.67], - np_value[np_value > 0.67])) - self.assertTrue( - np.array_equal(var_tensor[var_tensor < 0.55], - np_value[np_value < 0.55])) + np.testing.assert_array_equal(var[0], np_value[index[0]]) + np.testing.assert_array_equal(var[1], np_value[index[1]]) + np.testing.assert_array_equal(var[2], np_value[index[2]]) + np.testing.assert_array_equal(var[3], np_value[index[3]]) + np.testing.assert_array_equal(var[4], np_value[index[0]]) + np.testing.assert_array_equal(var[5], np_value[index2d]) + np.testing.assert_array_equal(var[6], np_value[index[4]]) + np.testing.assert_array_equal(var_tensor[var_tensor > 0.67], + np_value[np_value > 0.67]) + np.testing.assert_array_equal(var_tensor[var_tensor < 0.55], + np_value[np_value < 0.55]) with self.assertRaises(ValueError): var_tensor[[False, False, False, False]] @@ -951,30 +942,27 @@ def _test_scalar_bool_index(self): var = [ var_tensor[tensor_index].numpy(), ] - self.assertTrue(np.array_equal(var[0], np_value[index])) + np.testing.assert_array_equal(var[0], np_value[index]) def _test_for_var(self): np_value = np.random.random((30, 100, 100)).astype('float32') w = fluid.dygraph.to_variable(np_value) for i, e in enumerate(w): - self.assertTrue(np.array_equal(e.numpy(), np_value[i])) + np.testing.assert_array_equal(e.numpy(), np_value[i]) def _test_numpy_index(self): array = np.arange(120).reshape([4, 5, 6]) t = paddle.to_tensor(array) - self.assertTrue(np.array_equal(t[np.longlong(0)].numpy(), array[0])) - self.assertTrue( - np.array_equal( - t[np.longlong(0):np.longlong(4):np.longlong(2)].numpy(), - array[0:4:2])) - self.assertTrue(np.array_equal(t[np.int64(0)].numpy(), array[0])) - self.assertTrue( - np.array_equal(t[np.int32(1):np.int32(4):np.int32(2)].numpy(), - array[1:4:2])) - self.assertTrue( - np.array_equal(t[np.int16(0):np.int16(4):np.int16(2)].numpy(), - array[0:4:2])) + np.testing.assert_array_equal(t[np.longlong(0)].numpy(), array[0]) + np.testing.assert_array_equal( + t[np.longlong(0):np.longlong(4):np.longlong(2)].numpy(), + array[0:4:2]) + np.testing.assert_array_equal(t[np.int64(0)].numpy(), array[0]) + np.testing.assert_array_equal( + t[np.int32(1):np.int32(4):np.int32(2)].numpy(), array[1:4:2]) + np.testing.assert_array_equal( 
+ t[np.int16(0):np.int16(4):np.int16(2)].numpy(), array[0:4:2]) def _test_list_index(self): # case1: @@ -982,8 +970,8 @@ def _test_list_index(self): x = paddle.to_tensor(array) py_idx = [[0, 2, 0, 1, 3], [0, 0, 1, 2, 0]] idx = [paddle.to_tensor(py_idx[0]), paddle.to_tensor(py_idx[1])] - self.assertTrue(np.array_equal(x[idx].numpy(), array[py_idx])) - self.assertTrue(np.array_equal(x[py_idx].numpy(), array[py_idx])) + np.testing.assert_array_equal(x[idx].numpy(), array[py_idx]) + np.testing.assert_array_equal(x[py_idx].numpy(), array[py_idx]) # case2: tensor_x = paddle.to_tensor( np.zeros(12).reshape(2, 6).astype(np.float32)) @@ -993,12 +981,12 @@ def _test_list_index(self): res = tensor_x.numpy() exp = np.array([[0., 0., 42., 42., 42., 0.], [0., 0., 42., 42., 42., 0.]]) - self.assertTrue(np.array_equal(res, exp)) + np.testing.assert_array_equal(res, exp) # case3: row = np.array([0, 1, 2]) col = np.array([2, 1, 3]) - self.assertTrue(np.array_equal(array[row, col], x[row, col].numpy())) + np.testing.assert_array_equal(array[row, col], x[row, col].numpy()) def func_test_slice(self): with fluid.dygraph.guard(): @@ -1013,8 +1001,8 @@ def func_test_slice(self): self._test_list_index() var = fluid.dygraph.to_variable(self.array) - self.assertTrue(np.array_equal(var[1, :].numpy(), self.array[1, :])) - self.assertTrue(np.array_equal(var[::-1].numpy(), self.array[::-1])) + np.testing.assert_array_equal(var[1, :].numpy(), self.array[1, :]) + np.testing.assert_array_equal(var[::-1].numpy(), self.array[::-1]) with self.assertRaises(IndexError): y = var[self.shape[0]] @@ -1034,9 +1022,8 @@ def test_slice(self): def func_test_var_base_to_np(self): with fluid.dygraph.guard(): var = fluid.dygraph.to_variable(self.array) - self.assertTrue( - np.array_equal(var.numpy(), - fluid.framework._var_base_to_np(var))) + np.testing.assert_array_equal(var.numpy(), + fluid.framework._var_base_to_np(var)) def test_var_base_to_np(self): with _test_eager_guard(): @@ -1046,9 +1033,9 @@ def test_var_base_to_np(self): def func_test_var_base_as_np(self): with fluid.dygraph.guard(): var = fluid.dygraph.to_variable(self.array) - self.assertTrue(np.array_equal(var.numpy(), np.array(var))) - self.assertTrue( - np.array_equal(var.numpy(), np.array(var, dtype=np.float32))) + np.testing.assert_array_equal(var.numpy(), np.array(var)) + np.testing.assert_array_equal(var.numpy(), + np.array(var, dtype=np.float32)) def test_var_base_as_np(self): with _test_eager_guard(): @@ -1342,19 +1329,19 @@ def _test(self, value): else: result = self.np_value - self.assertTrue(np.array_equal(self.tensor_x[0].numpy(), result)) + np.testing.assert_array_equal(self.tensor_x[0].numpy(), result) self.assertEqual(id_origin, id(self.tensor_x)) self.tensor_x[1:2] = value if _in_legacy_dygraph(): self.assertEqual(self.tensor_x.inplace_version, 2) - self.assertTrue(np.array_equal(self.tensor_x[1].numpy(), result)) + np.testing.assert_array_equal(self.tensor_x[1].numpy(), result) self.assertEqual(id_origin, id(self.tensor_x)) self.tensor_x[...] 
= value if _in_legacy_dygraph(): self.assertEqual(self.tensor_x.inplace_version, 3) - self.assertTrue(np.array_equal(self.tensor_x[3].numpy(), result)) + np.testing.assert_array_equal(self.tensor_x[3].numpy(), result) self.assertEqual(id_origin, id(self.tensor_x)) def func_test_value_tensor(self): @@ -1447,19 +1434,19 @@ def _test(self, value): else: result = self.np_value - self.assertTrue(np.array_equal(self.tensor_x[0].numpy(), result)) + np.testing.assert_array_equal(self.tensor_x[0].numpy(), result) self.assertEqual(id_origin, id(self.tensor_x)) index_2 = paddle.to_tensor(np.array([False, True, False, False])) self.tensor_x[index_2] = value self.assertEqual(self.tensor_x.inplace_version, 2) - self.assertTrue(np.array_equal(self.tensor_x[1].numpy(), result)) + np.testing.assert_array_equal(self.tensor_x[1].numpy(), result) self.assertEqual(id_origin, id(self.tensor_x)) index_3 = paddle.to_tensor(np.array([True, True, True, True])) self.tensor_x[index_3] = value self.assertEqual(self.tensor_x.inplace_version, 3) - self.assertTrue(np.array_equal(self.tensor_x[3].numpy(), result)) + np.testing.assert_array_equal(self.tensor_x[3].numpy(), result) self.assertEqual(id_origin, id(self.tensor_x)) def func_test_value_tensor(self): @@ -1518,7 +1505,7 @@ def _test(self, value): else: result = self.np_value - self.assertTrue(np.array_equal(self.tensor_x[0].numpy(), result)) + np.testing.assert_array_equal(self.tensor_x[0].numpy(), result) self.assertEqual(id_origin, id(self.tensor_x)) diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 5fb220da609a4..7fc6ec935933d 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -178,34 +178,31 @@ def _test_slice(self, place): var13, var14, var15 ]) - self.assertTrue( - np.array_equal(local_out[1], tensor_array[0, 1, 1:2])) - self.assertTrue(np.array_equal(local_out[2], tensor_array[1:])) - self.assertTrue(np.array_equal(local_out[3], tensor_array[0:1])) - self.assertTrue(np.array_equal(local_out[4], tensor_array[::-1])) - self.assertTrue( - np.array_equal(local_out[5], tensor_array[1, 1:, 1:])) - self.assertTrue( - np.array_equal(local_out[6], - tensor_array.reshape((3, -1, 3))[:, :, -1])) - self.assertTrue( - np.array_equal(local_out[7], tensor_array[:, :, :-1])) - self.assertTrue( - np.array_equal(local_out[8], tensor_array[:1, :1, :1])) - self.assertTrue( - np.array_equal(local_out[9], tensor_array[:-1, :-1, :-1])) - self.assertTrue( - np.array_equal(local_out[10], tensor_array[::-1, :1, :-1])) - self.assertTrue( - np.array_equal(local_out[11], tensor_array[:-1, ::-1, -1:])) - self.assertTrue( - np.array_equal(local_out[12], tensor_array[1:2, 2:, ::-1])) - self.assertTrue( - np.array_equal(local_out[13], tensor_array[2:10, 2:, -2:-1])) - self.assertTrue( - np.array_equal(local_out[14], tensor_array[1:-1, 0:2, ::-1])) - self.assertTrue( - np.array_equal(local_out[15], tensor_array[::-1, ::-1, ::-1])) + np.testing.assert_array_equal(local_out[1], tensor_array[0, 1, 1:2]) + np.testing.assert_array_equal(local_out[2], tensor_array[1:]) + np.testing.assert_array_equal(local_out[3], tensor_array[0:1]) + np.testing.assert_array_equal(local_out[4], tensor_array[::-1]) + np.testing.assert_array_equal(local_out[5], tensor_array[1, 1:, 1:]) + np.testing.assert_array_equal( + local_out[6], + tensor_array.reshape((3, -1, 3))[:, :, -1]) + np.testing.assert_array_equal(local_out[7], tensor_array[:, :, :-1]) + 
np.testing.assert_array_equal(local_out[8], + tensor_array[:1, :1, :1]) + np.testing.assert_array_equal(local_out[9], + tensor_array[:-1, :-1, :-1]) + np.testing.assert_array_equal(local_out[10], + tensor_array[::-1, :1, :-1]) + np.testing.assert_array_equal(local_out[11], tensor_array[:-1, ::-1, + -1:]) + np.testing.assert_array_equal(local_out[12], tensor_array[1:2, + 2:, ::-1]) + np.testing.assert_array_equal(local_out[13], tensor_array[2:10, 2:, + -2:-1]) + np.testing.assert_array_equal(local_out[14], + tensor_array[1:-1, 0:2, ::-1]) + np.testing.assert_array_equal(local_out[15], + tensor_array[::-1, ::-1, ::-1]) def _test_slice_index_tensor(self, place): data = np.random.rand(2, 3).astype("float32") @@ -593,7 +590,7 @@ def test_static_graph_list_index(self): getitem_pp = exe.run(prog, feed={x.name: array}, fetch_list=fetch_list) - self.assertTrue(np.array_equal(getitem_np, getitem_pp[0])) + np.testing.assert_array_equal(getitem_np, getitem_pp[0]) array = array[0] index = index[0] @@ -620,7 +617,7 @@ def test_dygraph_list_index(self): index = index[0] continue getitem_pp = pt[index_mod] - self.assertTrue(np.array_equal(getitem_np, getitem_pp.numpy())) + np.testing.assert_array_equal(getitem_np, getitem_pp.numpy()) array = array[0] index = index[0] @@ -680,9 +677,10 @@ def test_static_graph_list_index_muti_dim(self): }, fetch_list=fetch_list) - self.assertTrue(np.array_equal(y2, getitem_pp[0]), - msg='\n numpy:{},\n paddle:{}'.format( - y2, getitem_pp[0])) + np.testing.assert_array_equal( + y2, + getitem_pp[0], + err_msg='\n numpy:{},\n paddle:{}'.format(y2, getitem_pp[0])) def test_dygraph_list_index_muti_dim(self): paddle.disable_static() @@ -707,7 +705,7 @@ def test_dygraph_list_index_muti_dim(self): y_np = array[index_t1, index_t2] y = x[index_t1, index_t2] - self.assertTrue(np.array_equal(y.numpy(), y_np)) + np.testing.assert_array_equal(y.numpy(), y_np) def run_getitem_list_index(self, array, index): x = paddle.static.data(name='x', shape=array.shape, dtype='float32') @@ -966,12 +964,16 @@ def test_static_graph_tensor_index_setitem_muti_dim(self): index_2.name: index_mod2 }, fetch_list=fetch_list) - self.assertTrue(np.array_equal(array2, setitem_pp[0]), - msg='\n numpy:{},\n paddle:{}'.format( - array2, setitem_pp[0])) - self.assertTrue(np.array_equal(array3, setitem_pp[1]), - msg='\n numpy:{},\n paddle:{}'.format( - array3, setitem_pp[1])) + np.testing.assert_array_equal( + array2, + setitem_pp[0], + err_msg='\n numpy:{},\n paddle:{}'.format( + array2, setitem_pp[0])) + np.testing.assert_array_equal( + array3, + setitem_pp[1], + err_msg='\n numpy:{},\n paddle:{}'.format( + array3, setitem_pp[1])) array = array[0] index1 = index1[0] index2 = index2[0] @@ -1028,19 +1030,27 @@ def test_static_graph_array_index_muti_dim(self): x2.name: array }, fetch_list=fetch_list) - self.assertTrue(np.array_equal(array2, setitem_pp[0]), - msg='\n numpy:{},\n paddle:{}'.format( - array2, setitem_pp[0])) - self.assertTrue(np.array_equal(array3, setitem_pp[1]), - msg='\n numpy:{},\n paddle:{}'.format( - array3, setitem_pp[1])) - - self.assertTrue(np.array_equal(y_np1, setitem_pp[2]), - msg='\n numpy:{},\n paddle:{}'.format( - y_np1, setitem_pp[2])) - self.assertTrue(np.array_equal(y_np2, setitem_pp[3]), - msg='\n numpy:{},\n paddle:{}'.format( - y_np2, setitem_pp[3])) + np.testing.assert_array_equal( + array2, + setitem_pp[0], + err_msg='\n numpy:{},\n paddle:{}'.format( + array2, setitem_pp[0])) + np.testing.assert_array_equal( + array3, + setitem_pp[1], + err_msg='\n numpy:{},\n paddle:{}'.format( 
+ array3, setitem_pp[1])) + + np.testing.assert_array_equal( + y_np1, + setitem_pp[2], + err_msg='\n numpy:{},\n paddle:{}'.format( + y_np1, setitem_pp[2])) + np.testing.assert_array_equal( + y_np2, + setitem_pp[3], + err_msg='\n numpy:{},\n paddle:{}'.format( + y_np2, setitem_pp[3])) array = array[0] index1 = index1[0] index2 = index2[0] @@ -1069,26 +1079,30 @@ def test_dygraph_array_index_muti_dim(self): y_t1 = tensor1[index_mod_t2, index_mod_t1] - self.assertTrue(np.array_equal(y_t1.numpy(), y_np1), - msg='\n numpy:{},\n paddle:{}'.format( - y_np1, y_t1.numpy())) + np.testing.assert_array_equal( + y_t1.numpy(), + y_np1, + err_msg='\n numpy:{},\n paddle:{}'.format(y_np1, y_t1.numpy())) # 1 dim getitem array2 = array.copy() y_np2 = array2[index_mod2] tensor2 = paddle.to_tensor(array) y_t2 = tensor2[index_mod_t2] - self.assertTrue(np.array_equal(y_t2.numpy(), y_np2), - msg='\n numpy:{},\n paddle:{}'.format( - y_np2, y_t2.numpy())) + np.testing.assert_array_equal( + y_t2.numpy(), + y_np2, + err_msg='\n numpy:{},\n paddle:{}'.format(y_np2, y_t2.numpy())) # 2 dim setitem array1 = array.copy() array1[index_mod1, index_mod2] = 1 tensor1[index_mod_t1, index_mod_t2] = 1 - self.assertTrue(np.array_equal(tensor1.numpy(), array1), - msg='\n numpy:{},\n paddle:{}'.format( - array1, tensor1.numpy())) + np.testing.assert_array_equal( + tensor1.numpy(), + array1, + err_msg='\n numpy:{},\n paddle:{}'.format( + array1, tensor1.numpy())) # 1 dim setitem array2 = array.copy() @@ -1096,9 +1110,11 @@ def test_dygraph_array_index_muti_dim(self): tensor2[index_mod_t1] = 2.5 - self.assertTrue(np.array_equal(tensor2.numpy(), array2), - msg='\n numpy:{},\n paddle:{}'.format( - array2, tensor2.numpy())) + np.testing.assert_array_equal( + tensor2.numpy(), + array2, + err_msg='\n numpy:{},\n paddle:{}'.format( + array2, tensor2.numpy())) array = array[0] index1 = index1[0] diff --git a/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py b/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py index a70d8e209b33d..ab9fa633bd56c 100644 --- a/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py +++ b/python/paddle/fluid/tests/unittests/test_view_op_reuse_allocation.py @@ -48,7 +48,7 @@ def func_test_view_api(self): var_numpy = var.numpy().reshape(self.output_shape) view_var_numpy = view_var.numpy() - self.assertTrue(np.array_equal(var_numpy, view_var_numpy)) + np.testing.assert_array_equal(var_numpy, view_var_numpy) def test_view_api(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_where_op.py b/python/paddle/fluid/tests/unittests/test_where_op.py index 967f917fd93b7..83381ac9fcd14 100644 --- a/python/paddle/fluid/tests/unittests/test_where_op.py +++ b/python/paddle/fluid/tests/unittests/test_where_op.py @@ -282,7 +282,7 @@ def __test_where_with_broadcast_dygraph(self, cond_shape, a_shape, b_shape): result = paddle.where(cond, a, b) result = result.numpy() expect = np.where(cond, a, b) - self.assertTrue(np.array_equal(expect, result)) + np.testing.assert_array_equal(expect, result) def test_dygraph_api_broadcast_1(self): cond_shape = [2, 4] diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py index 92d67406b033a..d97722c09805a 100644 --- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py @@ -587,7 +587,7 @@ def body(z, i): np_x = np.array([1, 2, 3, 4, 5], dtype='int32') res = 
exe.run(main_program, feed={'x': np_x}, fetch_list=[z]) - self.assertTrue(np.array_equal(res[0], [np.sum(np_x)])) + np.testing.assert_array_equal(res[0], [np.sum(np_x)]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py index 648e87f8c3174..6baf40486ddb6 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py @@ -70,8 +70,8 @@ def test_assign_LoDTensorArray(self): res = exe.run(main_program, feed={'x': feed_x}, fetch_list=[sums.name, x.grad_name]) - self.assertTrue(np.allclose(res[0], feed_add)) - self.assertTrue(np.allclose(res[1], ones / 1000.0)) + np.testing.assert_allclose(res[0], feed_add) + np.testing.assert_allclose(res[1], ones / 1000.0) class TestAssignOpError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py index 6455b157cb2ca..175bf152fe113 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest -import numpy +import numpy as np import sys sys.path.append("..") @@ -53,7 +53,7 @@ def setUp(self): self.outputs = {"Out": self.value} def init_data(self): - self.value = numpy.random.random(size=(2, 5)).astype(self.dtype) + self.value = np.random.random(size=(2, 5)).astype(self.dtype) self.attrs["fp32_values"] = [float(v) for v in self.value.flat] def test_forward(self): @@ -62,20 +62,20 @@ def test_forward(self): class TestAssignValueOp2(TestAssignValueOp): def init_data(self): - self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32) + self.value = np.random.random(size=(2, 5)).astype(np.int32) self.attrs["int32_values"] = [int(v) for v in self.value.flat] class TestAssignValueOp3(TestAssignValueOp): def init_data(self): - self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64) + self.value = np.random.random(size=(2, 5)).astype(np.int64) self.attrs["int64_values"] = [int(v) for v in self.value.flat] class TestAssignValueOp4(TestAssignValueOp): def init_data(self): - self.value = numpy.random.choice(a=[False, True], - size=(2, 5)).astype(numpy.bool) + self.value = np.random.choice(a=[False, True], + size=(2, 5)).astype(np.bool) self.attrs["bool_values"] = [int(v) for v in self.value.flat] @@ -83,7 +83,7 @@ class TestAssignApi(unittest.TestCase): def setUp(self): self.init_dtype() - self.value = (-100 + 200 * numpy.random.random(size=(2, 5))).astype( + self.value = (-100 + 200 * np.random.random(size=(2, 5))).astype( self.dtype) self.place = fluid.XPUPlace(0) @@ -98,8 +98,10 @@ def test_assign(self): exe = fluid.Executor(self.place) [fetched_x] = exe.run(main_program, feed={}, fetch_list=[x]) - self.assertTrue(numpy.array_equal(fetched_x, self.value), - "fetch_x=%s val=%s" % (fetched_x, self.value)) + np.testing.assert_allclose(fetched_x, + self.value, + err_msg="fetch_x=%s val=%s" % + (fetched_x, self.value)) self.assertEqual(fetched_x.dtype, self.value.dtype) @@ -119,8 +121,8 @@ class TestAssignApi4(TestAssignApi): def setUp(self): self.init_dtype() - self.value = numpy.random.choice(a=[False, True], - size=(2, 5)).astype(numpy.bool) + self.value = np.random.choice(a=[False, True], + size=(2, 5)).astype(np.bool) self.place = fluid.XPUPlace(0) def init_dtype(self): diff --git 
a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py index 9f15b72fe7d8b..f5c3e2b6a96e4 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py @@ -512,7 +512,7 @@ def test_case(self): expect_res = bilinear_interp_np( x_data, out_h=12, out_w=12, align_corners=True) for res in results: - self.assertTrue(np.allclose(res, expect_res)) + np.testing.assert_allclose(res, expect_res) ''' if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py index 33198a28933a5..d0b41f459eaf2 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py @@ -186,14 +186,14 @@ def test_clip(self): }, fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8]) - self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8))) - self.assertTrue(np.allclose(res2, data.clip(0.2, 0.9))) - self.assertTrue(np.allclose(res3, data.clip(min=0.3))) - self.assertTrue(np.allclose(res4, data.clip(max=0.7))) - self.assertTrue(np.allclose(res5, data.clip(min=0.2))) - self.assertTrue(np.allclose(res6, data.clip(max=0.8))) - self.assertTrue(np.allclose(res7, data.clip(max=-1))) - self.assertTrue(np.allclose(res8, data)) + np.testing.assert_allclose(res1, data.clip(0.2, 0.8)) + np.testing.assert_allclose(res2, data.clip(0.2, 0.9)) + np.testing.assert_allclose(res3, data.clip(min=0.3)) + np.testing.assert_allclose(res4, data.clip(max=0.7)) + np.testing.assert_allclose(res5, data.clip(min=0.2)) + np.testing.assert_allclose(res6, data.clip(max=0.8)) + np.testing.assert_allclose(res7, data.clip(max=-1)) + np.testing.assert_allclose(res8, data) paddle.disable_static() def test_clip_dygraph(self): @@ -213,9 +213,9 @@ def test_clip_dygraph(self): images = paddle.to_tensor(data, dtype='float32') out_3 = self._executed_api(images, min=v_min, max=v_max) - self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8))) - self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9))) - self.assertTrue(np.allclose(out_3.numpy(), data.clip(0.2, 0.8))) + np.testing.assert_allclose(out_1.numpy(), data.clip(0.2, 0.8)) + np.testing.assert_allclose(out_2.numpy(), data.clip(0.2, 0.9)) + np.testing.assert_allclose(out_3.numpy(), data.clip(0.2, 0.8)) def test_errors(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py index b4e8cf6b10e37..3227d76a64222 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py @@ -164,7 +164,7 @@ def test_dygraph(self): m = paddle.nn.Dropout(p=0.) 
m.eval() result = m(input) - self.assertTrue(np.allclose(result.numpy(), result_np)) + np.testing.assert_allclose(result.numpy(), result_np) class TestDropoutBackward(unittest.TestCase): @@ -188,10 +188,9 @@ def test_backward_downscale_in_infer(self): out, mask = core.ops.dropout(input, 'dropout_prob', 0.5) out.backward() - self.assertTrue( - np.array_equal( - input.gradient(), - self.cal_grad_downscale_in_infer(mask.numpy()))) + np.testing.assert_allclose( + input.gradient(), + self.cal_grad_downscale_in_infer(mask.numpy())) def test_backward_upscale_train(self): for place in self.places: @@ -205,10 +204,9 @@ def test_backward_upscale_train(self): "upscale_in_train") out.backward() - self.assertTrue( - np.allclose( - input.gradient(), - self.cal_grad_upscale_train(mask.numpy(), prob))) + np.testing.assert_allclose( + input.gradient(), + self.cal_grad_upscale_train(mask.numpy(), prob)) def test_backward_upscale_train_2(self): for place in self.places: @@ -222,10 +220,9 @@ def test_backward_upscale_train_2(self): "upscale_in_train") out.backward() - self.assertTrue( - np.allclose( - input.gradient(), - self.cal_grad_upscale_train(mask.numpy(), prob))) + np.testing.assert_allclose( + input.gradient(), + self.cal_grad_upscale_train(mask.numpy(), prob)) support_types = get_xpu_op_support_types('dropout') diff --git a/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py new file mode 100644 index 0000000000000..dc03827a74d58 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py @@ -0,0 +1,134 @@ +#Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
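All of the test changes above follow a single pattern: boolean wrappers such as self.assertTrue(np.allclose(...)) and self.assertTrue(np.array_equal(...)) become np.testing.assert_allclose / np.testing.assert_array_equal, which raise an AssertionError carrying the mismatching elements and tolerances instead of a bare "False is not true". A minimal self-contained sketch of the before/after (the array values are illustrative only, not taken from this patch):

import unittest

import numpy as np


class AssertionMigrationExample(unittest.TestCase):

    def test_old_style(self):
        # Old style: on failure unittest only reports "False is not true".
        self.assertTrue(np.allclose([1.0, 2.0], [1.0, 2.0 + 1e-9]))

    def test_new_style(self):
        # New style: failure output lists the mismatching positions and the
        # max absolute/relative error; rtol, atol and err_msg mirror the
        # arguments used in the hunks above.
        np.testing.assert_allclose([1.0, 2.0], [1.0, 2.0 + 1e-9], rtol=1e-7)
        np.testing.assert_array_equal([1, 2, 3], [1, 2, 3])


if __name__ == '__main__':
    unittest.main()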
+ +from __future__ import print_function + +import sys + +sys.path.append("..") + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test_xpu import XPUOpTest +from paddle.fluid import Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestEmptyOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'empty' + self.use_dynamic_create_class = False + + # Situation 1: Attr(shape) is a list(without tensor) + class TestEmptyOp(XPUOpTest): + + def setUp(self): + self.op_type = "empty" + self.init_dtype() + self.set_xpu() + self.place = paddle.XPUPlace(0) + self.set_shape() + self.set_inputs() + self.init_config() + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + data_type = outs[0].dtype + if data_type in [ + 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'float16', 'int16' + ]: + max_value = np.nanmax(outs[0]) + min_value = np.nanmin(outs[0]) + + always_full_zero = max_value == 0.0 and min_value == 0.0 + always_non_full_zero = max_value >= min_value + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = outs[0].size + true_num = np.sum(outs[0] == True) + false_num = np.sum(outs[0] == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + #pass + self.assertTrue(False, 'invalid data type') + + def set_shape(self): + self.shape = [500, 3] + + def set_inputs(self): + self.inputs = {} + + def init_config(self): + dtype_inner = convert_np_dtype_to_dtype_(self.dtype) + self.attrs = {'shape': self.shape, 'dtype': dtype_inner} + self.outputs = {'Out': np.zeros(self.shape).astype(self.dtype)} + + def init_dtype(self): + self.dtype = self.in_type + + def set_xpu(self): + self.__class__.use_xpu = True + self.__class__.no_need_check_grad = True + self.__class__.op_type = self.op_type + + class TestEmptyOpCase1(TestEmptyOp): + + def set_shape(self): + self.shape = [50] + + class TestEmptyOpCase2(TestEmptyOp): + + def set_shape(self): + self.shape = [1, 50, 3, 4] + + class TestEmptyOpCase3(TestEmptyOp): + + def set_shape(self): + self.shape = [5, 5, 5] + + # Situation 2: shape is a tensor + class TestEmptyOp_ShapeTensor(TestEmptyOp): + + def set_inputs(self): + self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} + + # Situation 3: Attr(shape) is a list(with tensor) + class TestEmptyOp_ShapeTensorList(TestEmptyOp): + + def set_inputs(self): + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {"ShapeTensorList": shape_tensor_list} + + +support_types = get_xpu_op_support_types('empty') +for stype in support_types: + create_test_class(globals(), XPUTestEmptyOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py new file mode 100644 index 0000000000000..867379bf81ef5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import numpy as np +import os +import tempfile +from paddle.fluid import core + +paddle.enable_static() + + +class TestDistModelRun(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + # step 6: clean up the env, delete the saved model and params + print('cleaned up the env') + self.temp_dir.cleanup() + + def test_dist_model_run(self): + # step 0: declare folder to save the model and params + path_prefix = os.path.join(self.temp_dir.name, + "dist_model_run_test/inf") + + # step 1: saving the inference model and params + x = paddle.static.data(name='x', shape=[28, 28], dtype='float32') + y = paddle.static.data(name='y', shape=[28, 1], dtype='int64') + predict = paddle.static.nn.fc(x, 10, activation='softmax') + loss = paddle.nn.functional.cross_entropy(predict, y) + avg_loss = paddle.tensor.stat.mean(loss) + exe = paddle.static.Executor(paddle.XPUPlace(0)) + exe.run(paddle.static.default_startup_program()) + x_data = np.random.randn(28, 28).astype('float32') + y_data = np.random.randint(0, 9, size=[28, 1]).astype('int64') + exe.run(paddle.static.default_main_program(), + feed={ + 'x': x_data, + 'y': y_data + }, + fetch_list=[avg_loss]) + paddle.static.save_inference_model(path_prefix, [x, y], [avg_loss], exe) + print('save model to', path_prefix) + + # step 2: prepare fake data for the inference + x_tensor = np.random.randn(28, 28).astype('float32') + y_tensor = np.random.randint(0, 9, size=[28, 1]).astype('int64') + + # step 3: init the dist model to inference with fake data + config = core.DistModelConfig() + config.model_dir = path_prefix + config.place = 'XPU' + dist = core.DistModel(config) + dist.init() + dist_x = core.DistModelTensor(x_tensor, 'x') + dist_y = core.DistModelTensor(y_tensor, 'y') + input_data = [dist_x, dist_y] + output_rst = dist.run(input_data) + dist_model_rst = output_rst[0].as_ndarray().ravel().tolist() + print("dist model rst:", dist_model_rst) + + # step 4: use framework's api to inference with fake data + [inference_program, feed_target_names, + fetch_targets] = (paddle.static.load_inference_model(path_prefix, exe)) + results = exe.run(inference_program, + feed={ + 'x': x_tensor, + 'y': y_tensor + }, + fetch_list=fetch_targets) + load_inference_model_rst = results[0] + print("load inference model api rst:", load_inference_model_rst) + + # step 5: compare two results + np.testing.assert_allclose(dist_model_rst, load_inference_model_rst) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py index 0b2470228b94a..4a1601ed99065 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py @@ -75,8 +75,12 @@ def verify_output(self, outs): hist2, _ = np.histogram(data, range=(-3, 
5)) hist2 = hist2.astype("float32") hist2 /= float(outs[0].size) - self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.01), - "hist: " + str(hist) + " hist2: " + str(hist2)) + np.testing.assert_allclose(hist, + hist2, + rtol=0, + atol=0.01, + err_msg="hist: " + str(hist) + + " hist2: " + str(hist2)) class TestMeanStdAreInt(TestGaussianRandomOp): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py index 73f61c2d9d5ba..f7be0e61d8100 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py @@ -214,9 +214,10 @@ def test_out(self): expected_result = np.matmul(data1.reshape(1, 2), data2.reshape(2, 1)) - self.assertTrue( - np.allclose(np_res, expected_result, atol=1e-3), - "two value is\ + np.testing.assert_allclose(np_res, + expected_result, + atol=1e-3, + err_msg="two value is\ {}\n{}, check diff!".format(np_res, expected_result)) def test_dygraph_without_out(self): @@ -228,8 +229,9 @@ def test_dygraph_without_out(self): data2 = fluid.dygraph.to_variable(input_array2) out = paddle.mm(data1, data2) expected_result = np.matmul(input_array1, input_array2) - self.assertTrue( - np.allclose(expected_result, out.numpy(), atol=1e-3)) + np.testing.assert_allclose(expected_result, + out.numpy(), + atol=1e-3) class Test_API_Matmul(unittest.TestCase): @@ -244,8 +246,9 @@ def test_dygraph_without_out(self): self.in_type) out = paddle.matmul(data1, data2) expected_result = np.matmul(input_array1, input_array2) - self.assertTrue( - np.allclose(expected_result, out.numpy(), atol=1e-3)) + np.testing.assert_allclose(expected_result, + out.numpy(), + atol=1e-3) class API_TestMmError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py index 370d7645a81fa..f5e8b491168f5 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py @@ -341,6 +341,24 @@ def init_exclusive(self): def init_adaptive(self): self.adaptive = False + class TestAvgPoolAdaptive(TestPool2D_Op): + + def init_adaptive(self): + self.adaptive = True + + class TestAvgPoolAdaptiveAsyOutSize(TestPool2D_Op): + + def init_adaptive(self): + self.adaptive = True + + def init_shape(self): + self.shape = [8, 3, 6, 6] + + def init_test_case(self): + self.ksize = [2, 3] + self.strides = [1, 1] + self.paddings = [0, 0, 0, 0] + class TestCase1(TestPool2D_Op): def init_test_case(self): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py index ebdebce268295..956100ea5690c 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py @@ -123,7 +123,7 @@ def test_api(self): exe = paddle.static.Executor(place=paddle.CPUPlace()) out = exe.run(main_prog, feed={"x": input}, fetch_list=[out]) - self.assertEqual(np.array_equal(out[0], input * 2.0 + 3.0), True) + np.testing.assert_array_equal(out[0], input * 2.0 + 3.0) class TestScaleInplaceApiStatic(TestScaleApiStatic): @@ -142,7 +142,7 @@ def test_api(self): input = np.random.random([2, 25]).astype("float32") x = paddle.to_tensor(input) out = self._executed_api(x, scale=2.0, bias=3.0) - self.assertEqual(np.array_equal(out.numpy(), input * 2.0 + 3.0), True) + np.testing.assert_array_equal(out.numpy(), 
input * 2.0 + 3.0) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py index 6996554d427fd..fefc5e24ee293 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -68,17 +68,10 @@ def test_check_output(self): def test_check_grad(self): place = paddle.XPUPlace(0) - if self.dtype in [np.float32, np.float64]: - self.check_grad_with_place(place, ['X'], 'Out') - elif self.dtype == np.bool_: + if self.dtype == np.bool_: return else: - user_defined_grad_outputs = np.random.random( - self.new_shape).astype(self.dtype) - self.check_grad_with_place( - place, ['X'], - 'Out', - user_defined_grad_outputs=user_defined_grad_outputs) + self.check_grad_with_place(place, ['X'], 'Out') # Correct: There is mins axis. class TestSqueeze2Op1(TestSqueeze2Op): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py index b766b6e3c002f..bfe91c19f0311 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,80 +22,90 @@ import paddle import paddle.fluid as fluid -from paddle.fluid import compiler, Program, program_guard +from paddle.fluid import Program, program_guard from op_test import OpTest from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -# Correct: General. -class TestSqueezeOp(XPUOpTest): +class XPUTestSqueezeOp(XPUOpTestWrapper): - def setUp(self): - self.op_type = "squeeze" - self.use_xpu = True - self.use_mkldnn = False - self.init_test_case() - self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - } + def __init__(self): + self.op_name = "squeeze" + self.use_dynamic_create_class = False - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') + # Correct: General. 
+ class TestSqueezeOp(XPUOpTest): - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, 2) - self.new_shape = (3, 40) + def setUp(self): + self.op_type = "squeeze" + self.__class__.op_type = "squeeze" + self.use_mkldnn = False + self.init_dtype() + self.init_test_case() + self.inputs = { + "X": np.random.random(self.ori_shape).astype(self.dtype) + } + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + } - def init_attrs(self): - self.attrs = {"axes": self.axes} + def init_dtype(self): + self.dtype = self.in_type + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) -# Correct: There is mins axis. -class TestSqueezeOp1(TestSqueezeOp): + def test_check_grad(self): + place = paddle.XPUPlace(0) + if self.dtype == np.bool_: + return + else: + self.check_grad_with_place(place, ['X'], 'Out') - def init_test_case(self): - self.ori_shape = (1, 3, 1, 40) - self.axes = (0, -2) - self.new_shape = (3, 40) + def init_test_case(self): + self.ori_shape = (1, 3, 1, 40) + self.axes = (0, 2) + self.new_shape = (3, 40) + def init_attrs(self): + self.attrs = {"axes": self.axes} -# Correct: No axes input. -class TestSqueezeOp2(TestSqueezeOp): + # Correct: There is mins axis. + class TestSqueezeOp1(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (1, 20, 1, 5) - self.axes = () - self.new_shape = (20, 5) + def init_test_case(self): + self.ori_shape = (1, 3, 1, 40) + self.axes = (0, -2) + self.new_shape = (3, 40) + # Correct: No axes input. + class TestSqueezeOp2(TestSqueezeOp): -# Correct: Just part of axes be squeezed. -class TestSqueezeOp3(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (1, 20, 1, 5) + self.axes = () + self.new_shape = (20, 5) - def init_test_case(self): - self.ori_shape = (6, 1, 5, 1, 4, 1) - self.axes = (1, -1) - self.new_shape = (6, 5, 1, 4) + # Correct: Just part of axes be squeezed. + class TestSqueezeOp3(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (6, 1, 5, 1, 4, 1) + self.axes = (1, -1) + self.new_shape = (6, 5, 1, 4) -# Correct: The demension of axis is not of size 1 remains unchanged. -class TestSqueezeOp4(TestSqueezeOp): + # Correct: The demension of axis is not of size 1 remains unchanged. + class TestSqueezeOp4(TestSqueezeOp): - def init_test_case(self): - self.ori_shape = (6, 1, 5, 1, 4, 1) - self.axes = (1, 2) - self.new_shape = (6, 5, 1, 4, 1) + def init_test_case(self): + self.ori_shape = (6, 1, 5, 1, 4, 1) + self.axes = (1, 2) + self.new_shape = (6, 5, 1, 4, 1) class TestSqueezeOpError(unittest.TestCase): @@ -115,5 +125,9 @@ def test_errors(self): self.assertRaises(TypeError, paddle.squeeze, x3, axes=0) +support_types = get_xpu_op_support_types("squeeze") +for stype in support_types: + create_test_class(globals(), XPUTestSqueezeOp, stype) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py index 8ba7f6818882a..6f3ce276113e4 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
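The squeeze rewrite here, and the unsqueeze2/unsqueeze rewrites below, all apply the same refactor: the standalone XPUOpTest subclasses are nested inside an XPUOpTestWrapper, and get_xpu_op_support_types plus create_test_class then register one concrete unittest class per dtype supported by the XPU kernel, delivering the dtype through self.in_type. A condensed sketch of the pattern, trimmed from the new squeeze test (init_attrs and the extra shape cases are omitted):

import sys
import unittest

sys.path.append("..")

import numpy as np
import paddle

from op_test_xpu import XPUOpTest
from xpu.get_test_cover_info import (XPUOpTestWrapper, create_test_class,
                                     get_xpu_op_support_types)

paddle.enable_static()


class XPUTestSqueezeOp(XPUOpTestWrapper):

    def __init__(self):
        self.op_name = "squeeze"
        self.use_dynamic_create_class = False

    class TestSqueezeOp(XPUOpTest):

        def setUp(self):
            self.op_type = "squeeze"
            self.__class__.op_type = "squeeze"
            self.dtype = self.in_type  # injected per generated class
            self.inputs = {
                "X": np.random.random((1, 3, 1, 40)).astype(self.dtype)
            }
            self.attrs = {"axes": (0, 2)}
            self.outputs = {"Out": self.inputs["X"].reshape((3, 40))}

        def test_check_output(self):
            self.check_output_with_place(paddle.XPUPlace(0))

        def test_check_grad(self):
            if self.dtype == np.bool_:
                return
            self.check_grad_with_place(paddle.XPUPlace(0), ["X"], "Out")


# One registered test class per dtype the XPU squeeze kernel supports.
for stype in get_xpu_op_support_types("squeeze"):
    create_test_class(globals(), XPUTestSqueezeOp, stype)

if __name__ == "__main__":
    unittest.main()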
@@ -69,17 +69,10 @@ def test_check_output(self): def test_check_grad(self): place = paddle.XPUPlace(0) - if self.dtype in [np.float32, np.float64, np.float16]: - self.check_grad_with_place(place, ['X'], 'Out') - elif self.dtype == np.bool_: + if self.dtype == np.bool_: return else: - user_defined_grad_outputs = np.random.random( - self.new_shape).astype(self.dtype) - self.check_grad_with_place( - place, ['X'], - 'Out', - user_defined_grad_outputs=user_defined_grad_outputs) + self.check_grad_with_place(place, ['X'], 'Out') # Correct: Single input index. class TestUnsqueeze2Op1(TestUnsqueeze2Op): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py index 9e505fe08a647..7b7dd13374939 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,76 +24,89 @@ import paddle.fluid as fluid from op_test import OpTest from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() # Correct: General. -class TestUnsqueezeOp(XPUOpTest): - - def setUp(self): - self.init_test_case() - self.op_type = "unsqueeze" - self.use_xpu = True - self.use_mkldnn = False - self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) +class XPUTestUnsqueezeOp(XPUOpTestWrapper): - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') + def __init__(self): + self.op_name = "unsqueeze" + self.use_dynamic_create_class = False + + class TestUnsqueezeOp(XPUOpTest): - def init_test_case(self): - self.ori_shape = (3, 40) - self.axes = (1, 2) - self.new_shape = (3, 1, 1, 40) + def setUp(self): + self.op_type = "unsqueeze" + self.__class__.op_type = "unsqueeze" + self.use_mkldnn = False + self.init_test_case() + self.inputs = { + "X": np.random.random(self.ori_shape).astype(self.dtype) + } + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - def init_attrs(self): - self.attrs = {"axes": self.axes} + def init_dtype(self): + self.dtype = self.in_type + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = paddle.XPUPlace(0) + if self.dtype == np.bool_: + return + else: + self.check_grad_with_place(place, ['X'], 'Out') -# Correct: Single input index. -class TestUnsqueezeOp1(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (1, 2) + self.new_shape = (3, 1, 1, 40) - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (-1, ) - self.new_shape = (20, 5, 1) + def init_attrs(self): + self.attrs = {"axes": self.axes} + # Correct: Single input index. + class TestUnsqueezeOp1(TestUnsqueezeOp): -# Correct: Mixed input axis. 
-class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1, ) + self.new_shape = (20, 5, 1) - def init_test_case(self): - self.ori_shape = (20, 5) - self.axes = (0, -1) - self.new_shape = (1, 20, 5, 1) + # Correct: Mixed input axis. + class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) -# Correct: There is duplicated axis. -class TestUnsqueezeOp3(TestUnsqueezeOp): + # Correct: There is duplicated axis. + class TestUnsqueezeOp3(TestUnsqueezeOp): - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (0, 3, 3) - self.new_shape = (1, 10, 2, 1, 1, 5) + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + # Correct: Reversed axes. + class TestUnsqueezeOp4(TestUnsqueezeOp): -# Correct: Reversed axes. -class TestUnsqueezeOp4(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (3, 1, 1) + self.new_shape = (10, 1, 1, 2, 5, 1) - def init_test_case(self): - self.ori_shape = (10, 2, 5) - self.axes = (3, 1, 1) - self.new_shape = (10, 1, 1, 2, 5, 1) +support_types = get_xpu_op_support_types("unsqueeze") +for stype in support_types: + create_test_class(globals(), XPUTestUnsqueezeOp, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/geometric/__init__.py b/python/paddle/geometric/__init__.py new file mode 100644 index 0000000000000..9e59062a7cc6a --- /dev/null +++ b/python/paddle/geometric/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .message_passing import send_u_recv # noqa: F401 + +__all__ = [ + 'send_u_recv', +] diff --git a/python/paddle/geometric/message_passing/__init__.py b/python/paddle/geometric/message_passing/__init__.py new file mode 100644 index 0000000000000..d9580e658650a --- /dev/null +++ b/python/paddle/geometric/message_passing/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
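The new paddle.geometric package wired up above re-exports send_u_recv from the message_passing subpackage, whose implementation follows next. Its data flow is the classic gather/scatter-reduce used in graph message passing: rows of x are gathered by src_index ("send") and reduced into the output rows named by dst_index ("recv"). A pure-NumPy reference of the "sum" case, reproducing the worked example from the send_u_recv docstring below (the helper name and code are illustrative, not part of the patch):

import numpy as np


def send_u_recv_sum_reference(x, src_index, dst_index, out_size=None):
    """NumPy reference for the sum-pool semantics of send_u_recv."""
    if out_size is None:
        out_size = x.shape[0]
    out = np.zeros((out_size,) + x.shape[1:], dtype=x.dtype)
    gathered = x[src_index]               # "send": gather the source rows
    np.add.at(out, dst_index, gathered)   # "recv": scatter-add into destinations
    return out


x = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
src_index = np.array([0, 1, 2, 0])
dst_index = np.array([1, 2, 1, 0])
print(send_u_recv_sum_reference(x, src_index, dst_index))
# [[ 0.  2.  3.]
#  [ 2.  8. 10.]
#  [ 1.  4.  5.]]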
+ +from .send_recv import send_u_recv # noqa: F401 diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py new file mode 100644 index 0000000000000..87379730a2a60 --- /dev/null +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -0,0 +1,162 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode +from paddle.fluid.framework import Variable +from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype +from paddle import _C_ops + +from .utils import convert_out_size_to_list, get_out_size_tensor_inputs + + +def send_u_recv(x, + src_index, + dst_index, + pool_type="sum", + out_size=None, + name=None): + """ + + Graph Learning message passing api. + + This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory + consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` + to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor + in different pooling types, like sum, mean, max, or min. Besides, we can use `out_size` to set necessary output shape. + + .. code-block:: text + + Given: + + X = [[0, 2, 3], + [1, 4, 5], + [2, 6, 7]] + + src_index = [0, 1, 2, 0] + + dst_index = [1, 2, 1, 0] + + pool_type = "sum" + + out_size = None + + Then: + + Out = [[0, 2, 3], + [2, 8, 10], + [1, 4, 5]] + + Args: + x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. + src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. + The available data type is int32, int64. + pool_type (str): Different pooling types, including `sum`, `mean`, `max`, `min`. + Default value is `sum`. + out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or + out_size is smaller or equal to 0, then this input will not be used. + Otherwise, `out_size` should be equal with or larger than + max(dst_index) + 1. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. + If `out_size` is set correctly, then it should have the same shape as `x` except + the 0th dimension. + + Examples: + + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum") + # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]] + + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out_size = paddle.max(dst_index) + 1 + out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum", out_size=out_size) + # Outputs: [[0., 2., 3.], [[2., 8., 10.]]] + + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum") + # Outputs: [[0., 2., 3.], [2., 8., 10.], [0., 0., 0.]] + + """ + + if pool_type not in ["sum", "mean", "max", "min"]: + raise ValueError( + "pool_type should be `sum`, `mean`, `max` or `min`, but received %s" + % pool_type) + + # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1. + + if _in_legacy_dygraph(): + out_size = convert_out_size_to_list(out_size) + out, tmp = _C_ops.graph_send_recv(x, src_index, + dst_index, None, 'pool_type', + pool_type.upper(), 'out_size', + out_size) + return out + if in_dygraph_mode(): + out_size = convert_out_size_to_list(out_size) + return _C_ops.final_state_graph_send_recv(x, src_index, dst_index, + pool_type.upper(), out_size) + + check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), + "graph_send_recv") + check_variable_and_dtype(src_index, "Src_index", ("int32", "int64"), + "graph_send_recv") + check_variable_and_dtype(dst_index, "Dst_index", ("int32", "int64"), + "graph_send_recv") + if out_size: + check_type(out_size, 'out_size', (int, np.int32, np.int64, Variable), + 'graph_send_recv') + if isinstance(out_size, Variable): + check_dtype(out_size.dtype, 'out_size', ['int32', 'int64'], + 'graph_send_recv') + + helper = LayerHelper("send_u_recv", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + dst_count = helper.create_variable_for_type_inference(dtype="int32", + stop_gradient=True) + + inputs = {"X": x, "Src_index": src_index, "Dst_index": dst_index} + attrs = {"pool_type": pool_type.upper()} + get_out_size_tensor_inputs(inputs=inputs, + attrs=attrs, + out_size=out_size, + op_type='graph_send_recv') + + helper.append_op(type="graph_send_recv", + inputs=inputs, + outputs={ + "Out": out, + "Dst_count": dst_count + }, + attrs=attrs) + return out diff --git a/python/paddle/geometric/message_passing/utils.py b/python/paddle/geometric/message_passing/utils.py new file mode 100644 index 0000000000000..3614f829daf52 --- /dev/null +++ b/python/paddle/geometric/message_passing/utils.py @@ -0,0 +1,52 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from paddle.fluid.framework import Variable +from paddle.fluid.data_feeder import check_dtype, convert_dtype +from paddle.fluid.layers.tensor import cast + + +def convert_out_size_to_list(out_size): + """ + Convert out_size(int, np.int32, np.int64, Variable) to list + in imperative mode. + """ + if out_size is None: + out_size = [0] + elif isinstance(out_size, (int, np.int32, np.int64)): + out_size = [out_size] + else: + out_size = [out_size.numpy().astype(int)[0]] + return out_size + + +def get_out_size_tensor_inputs(inputs, attrs, out_size, op_type): + """ + Convert out_size(int, np.int32, np.int64, Variable) to inputs + and attrs in static mode. + """ + if out_size is None: + attrs['out_size'] = [0] + elif isinstance(out_size, (int, np.int32, np.int64)): + attrs['out_size'] = [out_size] + elif isinstance(out_size, Variable): + out_size.stop_gradient = True + check_dtype(out_size.dtype, 'out_size', ['int32', 'int64'], 'op_type', + '(When type of out_size in' + op_type + ' is Variable.)') + if (convert_dtype(out_size.dtype) == 'int64'): + out_size = cast(out_size, 'int32') + inputs["Out_size"] = out_size + else: + raise TypeError("Out_size only supports Variable or int.") diff --git a/python/paddle/incubate/autograd/primops.py b/python/paddle/incubate/autograd/primops.py index f2313dfc2e8c4..c8b8a54df60e7 100644 --- a/python/paddle/incubate/autograd/primops.py +++ b/python/paddle/incubate/autograd/primops.py @@ -137,6 +137,11 @@ def exp(x, out=None): return _simple_unop(LayerHelper('exp_p', **locals())) +@REGISTER_FN('log_p', 'X', 'Y') +def log(x, out=None): + return _simple_unop(LayerHelper('log_p', **locals())) + + @REGISTER_FN('reshape_p', 'X', 'Y') def reshape(x, shape, out=None): return _manipulation_unop(LayerHelper('reshape_p', **locals())) diff --git a/python/paddle/incubate/autograd/primrules.py b/python/paddle/incubate/autograd/primrules.py index 56dffe932fa5d..f6f32c3237556 100644 --- a/python/paddle/incubate/autograd/primrules.py +++ b/python/paddle/incubate/autograd/primrules.py @@ -18,7 +18,7 @@ from .primops import (add, broadcast, concat, cos, div, exp, fill_const, gather, matmul, mul, neg, reduce, reshape, scatter_add, set_value, sin, slice_assign, slice_select, split, sqrt, sub, tanh, - transpose) + transpose, log) from .primreg import (REGISTER_JVP, REGISTER_ORIG2PRIM, REGISTER_PRIM2ORIG, REGISTER_TRANSPOSE, lookup_fn, lookup_jvp, lookup_orig2prim, lookup_prim2orig, lookup_transpose, @@ -166,6 +166,11 @@ def exp_orig2prim(op, x): return exp(x) +@REGISTER_ORIG2PRIM('log') +def log_orig2prim(op, x): + return log(x) + + @REGISTER_ORIG2PRIM('fill_zeros_like') def fill_zeros_like_orig2prim(op, x): return fill_const(value=0.0, shape=x.shape, dtype=x.dtype) @@ -333,6 +338,11 @@ def exp_prim2orig(op, x): return paddle.exp(x) +@REGISTER_PRIM2ORIG('log_p') +def log_prim2orig(op, x): + return paddle.log(x) + + @REGISTER_PRIM2ORIG('reshape_p') def reshape_prim2orig(op, x): return paddle.reshape(x, shape=op.attr('shape')) @@ -509,6 +519,14 @@ def exp_jvp(op, x_dot): return mul(x_dot, y) +@REGISTER_JVP('log_p') +def log_jvp(op, x_dot): + if x_dot is 
None: + return None + x, = op_position_inputs(op) + return div(x_dot, x) + + @REGISTER_JVP('reshape_p') def reshape_jvp(op, x_dot): if x_dot is None: diff --git a/python/paddle/incubate/autotune.py b/python/paddle/incubate/autotune.py index db7f881e4cf68..4c577cba3e70c 100644 --- a/python/paddle/incubate/autotune.py +++ b/python/paddle/incubate/autotune.py @@ -54,7 +54,6 @@ def set_config(config=None): Examples: .. code-block:: python - :name: auto-tuning import paddle import json diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py index e9937558e9b3a..132a6d4657ca1 100644 --- a/python/paddle/incubate/operators/graph_send_recv.py +++ b/python/paddle/incubate/operators/graph_send_recv.py @@ -12,13 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode -from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle.fluid import core +from paddle.fluid.framework import Variable +from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype +from paddle.fluid.layers.tensor import cast from paddle import _C_ops +import paddle.utils.deprecated as deprecated +@deprecated( + since="2.4.0", + update_to="paddle.geometric.send_u_recv", + level=1, + reason="graph_send_recv in paddle.incubate will be removed in future") def graph_send_recv(x, src_index, dst_index, @@ -63,14 +71,17 @@ def graph_send_recv(x, The available data type is int32, int64. pool_type (str): The pooling type of graph_send_recv, including `sum`, `mean`, `max`, `min`. Default value is `sum`. - out_size (int64|None): We can set `out_size` to get necessary output shape. If not set, then this - attribute will not be used. If set, it should be equal with or larger than - max(dst_index) + 1. + out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or + out_size is smaller or equal to 0, then this input will not be used. + Otherwise, `out_size` should be equal with or larger than + max(dst_index) + 1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. + out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. + If `out_size` is set correctly, then it should have the same shape as `x` except + the 0th dimension. Examples: @@ -109,31 +120,17 @@ def graph_send_recv(x, # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1. - if out_size is None or out_size <= 0: - if _in_legacy_dygraph(): - out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index, - 'pool_type', pool_type.upper()) - return out - if in_dygraph_mode(): - return _C_ops.final_state_graph_send_recv(x, src_index, dst_index, - pool_type.upper(), 0) - else: - if _in_legacy_dygraph(): - out, tmp = _C_ops.graph_send_recv(x, src_index, - dst_index, 'pool_type', - pool_type.upper(), 'out_size', - out_size) - return out - if in_dygraph_mode(): - if isinstance(out_size, core.eager.Tensor): - if (out_size.size < 1): - raise ValueError( - "out_size should be long type, but received Tensor type." 
- ) - out_size = out_size.numpy()[0] - return _C_ops.final_state_graph_send_recv(x, src_index, dst_index, - pool_type.upper(), - out_size) + if _in_legacy_dygraph(): + out_size = convert_out_size_to_list(out_size) + out, tmp = _C_ops.graph_send_recv(x, src_index, + dst_index, None, 'pool_type', + pool_type.upper(), 'out_size', + out_size) + return out + if in_dygraph_mode(): + out_size = convert_out_size_to_list(out_size) + return _C_ops.final_state_graph_send_recv(x, src_index, dst_index, + pool_type.upper(), out_size) check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), "graph_send_recv") @@ -141,25 +138,64 @@ def graph_send_recv(x, "graph_send_recv") check_variable_and_dtype(dst_index, "Dst_index", ("int32", "int64"), "graph_send_recv") + if out_size: + check_type(out_size, 'out_size', (int, np.int32, np.int64, Variable), + 'graph_send_recv') + if isinstance(out_size, Variable): + check_dtype(out_size.dtype, 'out_size', ['int32', 'int64'], + 'graph_send_recv') helper = LayerHelper("graph_send_recv", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) dst_count = helper.create_variable_for_type_inference(dtype="int32", stop_gradient=True) + + inputs = {"X": x, "Src_index": src_index, "Dst_index": dst_index} + attrs = {"pool_type": pool_type.upper()} + get_out_size_tensor_inputs(inputs=inputs, + attrs=attrs, + out_size=out_size, + op_type='graph_send_recv') + helper.append_op(type="graph_send_recv", - inputs={ - "X": x, - "Src_index": src_index, - "Dst_index": dst_index - }, + inputs=inputs, outputs={ "Out": out, "Dst_count": dst_count }, - attrs={ - "pool_type": - pool_type.upper(), - "out_size": - 0 if out_size is None or out_size <= 0 else out_size - }) + attrs=attrs) return out + + +def convert_out_size_to_list(out_size): + """ + Convert out_size(int, np.int32, np.int64, Variable) to list + in imperative mode. + """ + if out_size is None: + out_size = [0] + elif isinstance(out_size, (int, np.int32, np.int64)): + out_size = [out_size] + else: + out_size = [out_size.numpy().astype(int)[0]] + return out_size + + +def get_out_size_tensor_inputs(inputs, attrs, out_size, op_type): + """ + Convert out_size(int, np.int32, np.int64, Variable) to inputs + and attrs in static mode. 
+ """ + if out_size is None: + attrs['out_size'] = [0] + elif isinstance(out_size, (int, np.int32, np.int64)): + attrs['out_size'] = [out_size] + elif isinstance(out_size, Variable): + out_size.stop_gradient = True + check_dtype(out_size.dtype, 'out_size', ['int32', 'int64'], op_type, + '(When type of out_size in' + op_type + ' is Variable.)') + if (convert_dtype(out_size.dtype) == 'int64'): + out_size = cast(out_size, 'int32') + inputs["Out_size"] = out_size + else: + raise TypeError("Out_size only supports Variable or int.") diff --git a/python/paddle/jit/layer.py b/python/paddle/jit/layer.py index 4aee7a8f5c02a..97b598948500b 100644 --- a/python/paddle/jit/layer.py +++ b/python/paddle/jit/layer.py @@ -26,18 +26,19 @@ def __init__(self): def load(self, load_path, place): self.cpp_layer = Load(load_path, place) - function_dict = self.cpp_layer.function_dict() - for name, function in function_dict.items(): - self.functions[name] = Function(function) + for name in self.cpp_layer.function_names(): + function = self.cpp_layer.function(name) + info = self.cpp_layer.function_info(name) + self.functions[name] = Function(function, info) setattr(self, name, self.functions[name]) class Function(): - def __init__(self, function): + def __init__(self, function, info): self.function = function - self.info = FunctionInfo(function.info()) + self.info = FunctionInfo(info) def __call__(self, *args): return core.eager.jit_function_call(self.function, args) diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 8281cf37c4f67..74fa0e70c72f0 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -601,7 +601,6 @@ def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None): Examples: .. code-block:: python - :name: rrelu-example import paddle import paddle.nn.functional as F diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index b5e34199aafbf..7f381a884689a 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -44,7 +44,7 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): r""" - This op returns a col buffer of sliding local blocks of input x, also known + Return a col buffer of sliding local blocks of input x, also known as im2col for batched 2D image tensors. For each block under the convolution filter, all element will be rearranged as a column. While the convolution filter sliding over the input feature map, a series of such columns will be formed. @@ -91,15 +91,12 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): Returns: - The tensor corresponding to the sliding local blocks. + Tensor, The tensor corresponding to the sliding local blocks. The output shape is [N, Cout, Lout] as decriabled above. Cout is the total number of values within each block, and Lout is the total number of such blocks. The data type of output is the same as the input :math:`x` - Return Type: - Tensor - Examples: .. 
code-block:: python @@ -1007,14 +1004,14 @@ def dropout(x, print(y_01) """ - # fast return for p == 0 - if p == 0: - return x - - if not isinstance(p, (float, int)): - raise TypeError("p argument should be a number") - if p < 0 or p > 1: - raise ValueError("p argument should between 0 and 1") + if not isinstance(p, (float, int, Variable)): + raise TypeError("p argument should be a number or Variable") + + if isinstance(p, (int, float)): + # fast return for p == 0 + if p == 0: return x + elif p < 0 or p > 1: + raise ValueError("p argument should between 0 and 1") if mode not in ('downscale_in_infer', 'upscale_in_train'): raise ValueError( "mode argument should be 'downscale_in_infer' or 'upscale_in_train'" @@ -1053,6 +1050,12 @@ def dropout(x, def get_attrs(prog, dropout_prob, is_test, seed): if (seed is None or seed == 0) and prog.random_seed != 0: seed = prog.random_seed + + if isinstance(dropout_prob, + Variable) and not dropout_prob.shape != [1]: + raise TypeError( + "Required p.shape == [1] if type(p) is Variable, but received p.shape = {}" + .format(p.shape)) attrs = { 'dropout_prob': dropout_prob, 'is_test': is_test, @@ -1321,21 +1324,21 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right, pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back). - mode (str): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. + mode (str, optional): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. When in 'constant' mode, this op uses a constant value to pad the input tensor. When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. When in 'replicate' mode, uses input boundaries to pad the input tensor. When in 'circular' mode, uses circular input to pad the input tensor. Default is 'constant' - value (float32): The value to fill the padded areas in 'constant' mode . Default is 0.0 - data_format (str): An string from: "NCL", "NLC", NHWC", "NCHW", "NCDHW", "NDHWC". Specify the data format of + value (float32, optional): The value to fill the padded areas in 'constant' mode . Default is 0.0 + data_format (str, optional): An string from: "NCL", "NLC", NHWC", "NCHW", "NCDHW", "NDHWC". Specify the data format of the input data. Default is "NCHW" name (str, optional) : The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Returns: a Tensor padded according to pad and mode and data type is same as input. - Return Type: Tensor + Returns: + Tensor, a Tensor padded according to pad and mode and data type is same as input. Examples: .. code-block:: text @@ -1549,12 +1552,13 @@ def zeropad2d(x, padding, data_format="NCHW", name=None): padding(int | Tensor | List[int] | Tuple[int]): The padding size with data type int. The input dimension should be 4 and pad has the form (pad_left, pad_right, pad_top, pad_bottom). - data_format(str): An string from: "NHWC", "NCHW". Specify the data format of + data_format(str, optional): An string from: "NHWC", "NCHW". Specify the data format of the input data. Default: "NCHW". name(str, optional): The default value is None. Normally there is no need for user to set this property. - Returns:Tensor,padded with 0 according to pad and data type is same as input. 
+ Returns: + Tensor, padded with 0 according to pad and data type is same as input. Examples: .. code-block:: python @@ -1587,11 +1591,11 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): Parameters: x1 (Tensor): First input. float32/double. x2 (Tensor): Second input. float32/double. - axis (int): Dimension of vectors to compute cosine similarity. Default is 1. - eps(float): Small value to avoid division by zero. Default is 1e-8. + axis (int, optional): Dimension of vectors to compute cosine similarity. Default is 1. + eps(float, optional): Small value to avoid division by zero. Default is 1e-8. - Returns: a Tensor representing cosine similarity between x1 and x2 along axis. - Return Type: Tensor + Returns: + Tensor, a Tensor representing cosine similarity between x1 and x2 along axis. Examples: .. code-block:: text @@ -1614,16 +1618,14 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): import paddle import paddle.nn as nn - import numpy as np - np.random.seed(0) - x1 = np.random.rand(2,3) - x2 = np.random.rand(2,3) - x1 = paddle.to_tensor(x1) - x2 = paddle.to_tensor(x2) + paddle.seed(1) + x1 = paddle.randn(shape=[2, 3]) + x2 = paddle.randn(shape=[2, 3]) + result = paddle.nn.functional.cosine_similarity(x1, x2, axis=0) print(result) - # [0.99806249 0.9817672 0.94987036] + # [0.97689527, 0.99996042, -0.55138415] """ w12 = sum(paddle.multiply(x1, x2), axis=axis) @@ -1956,7 +1958,11 @@ class centers and the shape of sampled_class_center will be [num_positive_class_ if (seed is None or seed == 0) and default_main_program().random_seed != 0: seed = default_main_program().random_seed - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_class_center_sample( + label, num_classes, num_samples, ring_id, rank, nranks, seed + is not None, seed if seed is not None else 0) + elif paddle.in_dynamic_mode(): remapped_label, sampled_class_center = _C_ops.class_center_sample( label, 'num_classes', num_classes, 'num_samples', num_samples, 'ring_id', ring_id, 'nranks', nranks, 'rank', rank, 'fix_seed', seed @@ -2103,7 +2109,10 @@ def _is_list_or_turple_(data): "Unexpected type of paddings, it should be either an integer or a list" "of 2 or 4 integers") - if in_dynamic_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_fold(x, output_sizes, kernel_sizes, strides, + paddings, dilations) + elif in_dynamic_mode(): out = _C_ops.fold(x, "output_sizes", output_sizes, "kernel_sizes", kernel_sizes, "strides", strides, "paddings", paddings, "dilations", dilations) diff --git a/python/paddle/nn/functional/distance.py b/python/paddle/nn/functional/distance.py index 8c672ffc69fd2..4d6f447d6737f 100644 --- a/python/paddle/nn/functional/distance.py +++ b/python/paddle/nn/functional/distance.py @@ -67,12 +67,12 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): check_type(epsilon, 'epsilon', (float), 'PairwiseDistance') check_type(keepdim, 'keepdim', (bool), 'PairwiseDistance') if in_dygraph_mode(): - sub = _C_ops.elementwise_sub(x, y) + sub = _C_ops.final_state_subtract(x, y) # p_norm op has not uesd epsilon, so change it to the following. 
if epsilon != 0.0: epsilon = paddle.fluid.dygraph.base.to_variable([epsilon], dtype=sub.dtype) - sub = _C_ops.elementwise_add(sub, epsilon) + sub = _C_ops.final_state_add(sub, epsilon) return _C_ops.final_state_p_norm(sub, p, -1, 0., keepdim, False) if _in_legacy_dygraph(): diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 6191d015e2a20..995ba19058842 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -191,12 +191,11 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: The output sequence mask. Tensor with shape [d_1, d_2, ..., d_n, maxlen] \ + Returns: + Tensor, The output sequence mask. Tensor with shape [d_1, d_2, ..., d_n, maxlen] \ and data type of :code:`dtype`. The data type should be bool, float32, float64, int8, \ int32 or int64. - Return Type: Tensor - Examples: .. code-block:: python diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 4e568a571edac..7e73f3ce8ac4a 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -396,10 +396,8 @@ def square_error_cost(input, label): label (Tensor): Label tensor, the data type should be float32. Returns: - The tensor storing the element-wise squared error \ - difference between input and label. - - Return type: Tensor. + Tensor, The tensor storing the element-wise squared error + difference between input and label. Examples: @@ -981,7 +979,7 @@ def hsigmoid_loss(input, def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): r""" - This operator calculates smooth_l1_loss. Creates a criterion that uses a squared + Calculate smooth_l1_loss. Creates a criterion that uses a squared term if the absolute element-wise error falls below 1 and an L1 term otherwise. In some cases it can prevent exploding gradients and it is more robust and less sensitivity to outliers. Also known as the Huber loss: @@ -1020,9 +1018,7 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): None). For more information, please refer to :ref:`api_guide_Name`. Returns: - The tensor variable storing the smooth_l1_loss of input and label. - - Return type: Tensor. + Tensor, The tensor variable storing the smooth_l1_loss of input and label. Examples: .. code-block:: python @@ -1081,7 +1077,7 @@ def margin_ranking_loss(input, name=None): r""" - This op the calcluate the margin rank loss between the input, other and label, use the math function as follows. + Calcluate the margin rank loss between the input, other and label, use the math function as follows. .. math:: margin\_rank\_loss = max(0, -label * (input - other) + margin) @@ -1106,7 +1102,8 @@ def margin_ranking_loss(input, reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - Returns: Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor. 
+ Returns: + Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor. Examples: @@ -1540,7 +1537,7 @@ def kl_div(input, label, reduction='mean', name=None): def mse_loss(input, label, reduction='mean', name=None): r""" - This op accepts input predications and label and returns the mean square error. + Accept input predications and label and returns the mean square error. If :attr:`reduction` is set to ``'none'``, loss is calculated as: @@ -1570,9 +1567,7 @@ def mse_loss(input, label, reduction='mean', name=None): Returns: - Tensor: The tensor tensor storing the mean square error difference of input and label. - - Return type: Tensor. + Tensor, The tensor tensor storing the mean square error difference of input and label. Examples: @@ -1931,7 +1926,19 @@ def margin_cross_entropy(logits, if input_dims - 1 == label_dims: label = paddle.unsqueeze(label, axis=-1) - if in_dynamic_mode(): + if in_dygraph_mode(): + softmax, loss = _C_ops.final_state_margin_cross_entropy( + logits, label, return_softmax, ring_id, rank, nranks, margin1, + margin2, margin3, scale) + if reduction == 'mean': + loss = paddle.mean(loss) + elif reduction == 'sum': + loss = paddle.sum(loss) + if not return_softmax: + return loss + else: + return loss, softmax + elif paddle.in_dynamic_mode(): softmax, loss = _C_ops.margin_cross_entropy( logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks, 'margin1', margin1, 'margin2', margin2, 'margin3', margin3, 'scale', @@ -2110,7 +2117,7 @@ def cross_entropy(input, Return the average value of the previous results - .. math:: + .. math:: \\loss=\sum_{j}loss_j/N where, N is the number of samples and C is the number of categories. @@ -2119,21 +2126,21 @@ def cross_entropy(input, 1. Hard labels (soft_label = False) - .. math:: + .. math:: \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] 2. Soft labels (soft_label = True) - .. math:: + .. math:: \\loss=\sum_{j}loss_j/\sum_{j}\left(\sum_{i}weight[label_i]\right) - - + + Parameters: - **input** (Tensor) Input tensor, the data type is float32, float64. Shape is - :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . + :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . Note: @@ -2141,7 +2148,7 @@ def cross_entropy(input, output of softmax operator, which will produce incorrect results. 2. when use_softmax=False, it expects the output of softmax operator. - + - **label** (Tensor) 1. If soft_label=False, the shape is @@ -2209,10 +2216,11 @@ def cross_entropy(input, 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . - Example1(hard labels): + Examples: .. code-block:: python - + + # hard labels import paddle paddle.seed(99999) N=100 @@ -2229,11 +2237,9 @@ def cross_entropy(input, label) print(dy_ret.numpy()) #[5.41993642] - - Example2(soft labels): - .. code-block:: python - + + # soft labels import paddle paddle.seed(99999) axis = -1 @@ -2900,7 +2906,6 @@ def cosine_embedding_loss(input1, Examples: .. 
code-block:: python - :name: code-example1 import paddle diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 4bb53e1737bf8..766a1ca3aaf64 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -782,7 +782,11 @@ def max_unpool1d(x, output_size = _unpool_output_size(x, kernel_size, stride, padding, output_size) - if in_dynamic_mode(): + if in_dygraph_mode(): + output = _C_ops.final_state_unpool(x, indices, kernel_size, stride, + padding, output_size, data_format) + return squeeze(output, [2]) + elif in_dynamic_mode(): output = _C_ops.unpool(x, indices, 'unpooling_type', 'max', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, "output_size", output_size, @@ -838,7 +842,6 @@ def max_unpool2d(x, it must contain an integer. stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. - kernel_size (int|tuple): Size of the max unpooling window. padding (int | tuple): Padding that was added to the input. output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, @@ -898,7 +901,11 @@ def max_unpool2d(x, output_size = _unpool_output_size(x, kernel_size, stride, padding, output_size) - if in_dynamic_mode(): + if in_dygraph_mode(): + output = _C_ops.final_state_unpool(x, indices, kernel_size, stride, + padding, output_size, data_format) + + elif in_dynamic_mode(): output = _C_ops.unpool(x, indices, 'unpooling_type', 'max', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, "output_size", output_size, @@ -1011,7 +1018,10 @@ def max_unpool3d(x, output_size = _unpool_output_size(x, kernel_size, stride, padding, output_size) - if in_dynamic_mode(): + if in_dygraph_mode(): + output = _C_ops.final_state_unpool3d(x, indices, kernel_size, stride, + padding, output_size, data_format) + elif in_dynamic_mode(): output = _C_ops.unpool3d(x, indices, 'unpooling_type', 'max', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, "output_size", output_size, @@ -1286,26 +1296,25 @@ def adaptive_avg_pool1d(x, output_size, name=None): Tensor: The result of 1D adaptive average pooling. Its data type is same as input. Examples: .. code-block:: python - :name: adaptive_avg_pool1d-example - # average adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) - # - import paddle - import paddle.nn.functional as F + # average adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. 
+ # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) + # + import paddle + import paddle.nn.functional as F - data = paddle.uniform([1, 3, 32]) - pool_out = F.adaptive_avg_pool1d(data, output_size=16) - # pool_out shape: [1, 3, 16]) + data = paddle.uniform([1, 3, 32]) + pool_out = F.adaptive_avg_pool1d(data, output_size=16) + # pool_out shape: [1, 3, 16]) """ pool_type = 'avg' if not in_dynamic_mode(): diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 0e06612bbb716..7515ee66dc9ff 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -367,7 +367,6 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None): Examples: .. code-block:: python - :name: pixel_unshuffle-example import paddle import paddle.nn.functional as F @@ -424,7 +423,6 @@ def channel_shuffle(x, groups, data_format="NCHW", name=None): Examples: .. code-block:: python - :name: channel_shuffle-example import paddle import paddle.nn.functional as F diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py index 66818dab451a3..5cc5e60c2e0a3 100644 --- a/python/paddle/nn/initializer/constant.py +++ b/python/paddle/nn/initializer/constant.py @@ -26,7 +26,7 @@ class Constant(ConstantInitializer): Examples: .. code-block:: python - :name: code-example1 + import paddle import paddle.nn as nn diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py index 2d0cd77ee17e9..730c55ea6f1b8 100644 --- a/python/paddle/nn/initializer/normal.py +++ b/python/paddle/nn/initializer/normal.py @@ -72,7 +72,6 @@ class TruncatedNormal(TruncatedNormalInitializer): Examples: .. code-block:: python - :name: initializer_TruncatedNormal-example import paddle diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py index ee9b36ecf7c7b..707d3d03ecc13 100644 --- a/python/paddle/nn/initializer/uniform.py +++ b/python/paddle/nn/initializer/uniform.py @@ -30,7 +30,6 @@ class Uniform(UniformInitializer): Examples: .. code-block:: python - :name: initializer_Uniform-example import paddle diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index d6570f9db2fe5..2c6d60a6bb86b 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -41,7 +41,6 @@ class XavierNormal(XavierInitializer): Examples: .. code-block:: python - :name: initializer_XavierNormal-example import paddle @@ -97,7 +96,6 @@ class XavierUniform(XavierInitializer): Examples: .. code-block:: python - :name: initializer_XavierUniform-example import paddle diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 6e2a11c89cc64..c03864a19d58c 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -486,7 +486,6 @@ class RReLU(Layer): Examples: .. code-block:: python - :name: RReLU-example import paddle diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 54ef832d73179..635315b0027e1 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -295,7 +295,7 @@ class CrossEntropyLoss(Layer): - **input** (Tensor) Input tensor, the data type is float32, float64. 
Shape is - :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . + :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . Note: @@ -303,7 +303,7 @@ class CrossEntropyLoss(Layer): output of softmax operator, which will produce incorrect results. 2. when use_softmax=False, it expects the output of softmax operator. - + - **label** (Tensor) @@ -313,7 +313,7 @@ class CrossEntropyLoss(Layer): 2. If soft_label=True, the shape and data type should be same with ``input`` , and the sum of the labels for each sample should be 1. - + - **output** (Tensor) Return the softmax cross_entropy loss of ``input`` and ``label``. @@ -328,10 +328,11 @@ class CrossEntropyLoss(Layer): 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . - Example1(hard labels): + Examples: .. code-block:: python + # hard labels import paddle paddle.seed(99999) N=100 @@ -348,11 +349,9 @@ class CrossEntropyLoss(Layer): label) print(dy_ret.numpy()) #[5.41993642] - - Example2(soft labels): - .. code-block:: python - + + # soft labels import paddle paddle.seed(99999) axis = -1 @@ -1178,16 +1177,16 @@ class SmoothL1Loss(Layer): None). For more information, please refer to :ref:`api_guide_Name`. Call Parameters: - input (Tensor): Input tensor, the data type is float32 or float64. Shape is - (N, C), where C is number of classes, and if shape is more than 2D, this - is (N, C, D1, D2,..., Dk), k >= 1. - label (Tensor): Label tensor, the data type is float32 or float64. The shape of label - is the same as the shape of input. - Returns: - The tensor storing the smooth_l1_loss of input and label. + input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), + where C is number of classes, and if shape is more than 2D, + this is (N, C, D1, D2,..., Dk), k >= 1. - Return type: Tensor. + label (Tensor): Label tensor, the data type is float32 or float64. + The shape of label is the same as the shape of input. + + Returns: + Tensor, The tensor storing the smooth_l1_loss of input and label. Examples: .. code-block:: python @@ -1435,7 +1434,6 @@ class CosineEmbeddingLoss(Layer): Examples: .. code-block:: python - :name: code-example1 import paddle diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index e7b6fc24afad8..46ae56a463ea3 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -644,7 +644,7 @@ class AdaptiveAvgPool1D(Layer): Examples: .. code-block:: python - :name: AdaptiveAvgPool1D-example + # average adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], # output shape is [N, C, m], adaptive pool divide L dimension diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py index 2fa150dcbdfbc..80363a94ec266 100644 --- a/python/paddle/nn/layer/vision.py +++ b/python/paddle/nn/layer/vision.py @@ -110,7 +110,6 @@ class PixelUnshuffle(Layer): Examples: .. code-block:: python - :name: PixelUnshuffle-example import paddle import paddle.nn as nn @@ -173,7 +172,6 @@ class ChannelShuffle(Layer): Examples: .. code-block:: python - :name: ChannelShuffle-example import paddle import paddle.nn as nn diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py index 29233e6ced0a2..5a5f52bb3ef3d 100644 --- a/python/paddle/optimizer/lamb.py +++ b/python/paddle/optimizer/lamb.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ from ..fluid.layer_helper import LayerHelper from paddle import _C_ops from paddle.fluid.executor import global_scope +import paddle __all__ = [] @@ -266,6 +267,13 @@ def _append_optimize_op(self, block, param_and_grad): master_weight = None found_inf = self._get_auxiliary_var('found_inf') + if framework.in_dygraph_mode(): + _C_ops.final_state_lamb_(param_and_grad[0], param_and_grad[1], lr, + moment1, moment2, beta1_pow_acc, + beta2_pow_acc, master_weight, found_inf, + weight_decay, self._beta1, self._beta2, + self._epsilon, find_master) + return None if framework._non_static_mode(): _C_ops.lamb(param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index c277dcedb4d93..2df26020b8fc9 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -173,7 +173,6 @@ def export_chrome_tracing(dir_name: str, The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . .. code-block:: python - :name: code-example1 # required: gpu import paddle.profiler as profiler @@ -224,7 +223,6 @@ def export_protobuf(dir_name: str, The return value can be used as parameter ``on_trace_ready`` in :ref:`Profiler ` . .. code-block:: python - :name: code-example1 # required: gpu import paddle.profiler as profiler diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index a02311cc92985..c0146fe92763f 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -138,7 +138,6 @@ def load_profiler_result(filename: str): Examples: .. code-block:: python - :name: code-example1 # required: gpu import paddle.profiler as profiler diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index 1950835151074..de9e48b3367cc 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -423,6 +423,25 @@ def save_to_file(path, content): content(bytes): Content to write. Returns: None + + Examples: + .. code-block:: python + + import paddle + paddle.enable_static() + path_prefix = "./infer_model" + # 用户自定义网络,此处用 softmax 回归为例。 + image = paddle.static.data(name='img', shape=[None, 28, 28], dtype='float32') + label = paddle.static.data(name='label', shape=[None, 1], dtype='int64') + predict = paddle.static.nn.fc(image, 10, activation='softmax') + loss = paddle.nn.functional.cross_entropy(predict, label) + exe = paddle.static.Executor(paddle.CPUPlace()) + exe.run(paddle.static.default_startup_program()) + # 序列化参数 + serialized_params = paddle.static.serialize_persistables([image], [predict], exe) + # 将序列化之后的参数保存到文件 + params_path = path_prefix + ".params" + paddle.static.save_to_file(params_path, serialized_params) """ if not isinstance(content, bytes): @@ -450,8 +469,11 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor, executor(Executor): The executor that saves the inference model. You can refer to :ref:`api_guide_executor_en` for more details. kwargs: Supported keys including 'program' and "clip_extra". Attention please, kwargs is used for backward compatibility mainly. - - program(Program): specify a program if you don't want to use default main program. - - clip_extra(bool): set to True if you want to clip extra information for every operator. 
+ + - program(Program): specify a program if you don't want to use default main program. + + - clip_extra(bool): set to True if you want to clip extra information for every operator. + Returns: None @@ -520,7 +542,9 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor, save_to_file(model_path, program_bytes) # serialize and save params params_bytes = _serialize_persistables(program, executor) - save_to_file(params_path, params_bytes) + # program may not contain any parameter and just compute operation + if params_bytes is not None: + save_to_file(params_path, params_bytes) @static_only @@ -638,6 +662,12 @@ def deserialize_persistables(program, data, executor): check_vars.append(var) load_var_map[var_copy.name] = var_copy + if data is None: + assert len( + origin_shape_map + ) == 0, "Required 'data' shall be not None if program contains parameter, but received 'data' is None." + return + # append load_combine op to load parameters, load_var_list = [] for name in sorted(load_var_map.keys()): @@ -675,6 +705,28 @@ def load_from_file(path): path(str): Path of an existed file. Returns: bytes: Content of file. + + Examples: + + .. code-block:: python + + import paddle + paddle.enable_static() + path_prefix = "./infer_model" + # 用户自定义网络,此处用 softmax 回归为例。 + image = paddle.static.data(name='img', shape=[None, 28, 28], dtype='float32') + label = paddle.static.data(name='label', shape=[None, 1], dtype='int64') + predict = paddle.static.nn.fc(image, 10, activation='softmax') + loss = paddle.nn.functional.cross_entropy(predict, label) + exe = paddle.static.Executor(paddle.CPUPlace()) + exe.run(paddle.static.default_startup_program()) + # 序列化参数 + serialized_params = paddle.static.serialize_persistables([image], [predict], exe) + # 将序列化之后的参数保存到文件 + params_path = path_prefix + ".params" + paddle.static.save_to_file(params_path, serialized_params) + # 从文件加载序列化之后的参数 + serialized_params_copy = paddle.static.load_from_file(params_path) """ with open(path, 'rb') as f: data = f.read() @@ -805,7 +857,9 @@ def load_inference_model(path_prefix, executor, **kwargs): params_filename = os.path.basename(params_path) # load params data params_path = os.path.join(load_dirname, params_filename) - params_bytes = load_from_file(params_path) + params_bytes = None + if os.path.exists(params_path): + params_bytes = load_from_file(params_path) # deserialize bytes to program program = deserialize_program(program_bytes) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index ef492620956c0..857175120fc59 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -178,7 +178,6 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): Examples: .. code-block:: python - :name: logspace-example import paddle data = paddle.logspace(0, 10, 5, 2, 'float32') @@ -492,25 +491,24 @@ def ones(shape, dtype=None, name=None): Examples: .. code-block:: python - :name: ones-example - import paddle - - # default dtype for ones OP - data1 = paddle.ones(shape=[3, 2]) - # [[1. 1.] - # [1. 1.] - # [1. 1.]] - - data2 = paddle.ones(shape=[2, 2], dtype='int32') - # [[1 1] - # [1 1]] - - # shape is a Tensor - shape = paddle.full(shape=[2], dtype='int32', fill_value=2) - data3 = paddle.ones(shape=shape, dtype='int32') - # [[1 1] - # [1 1]] + import paddle + + # default dtype for ones OP + data1 = paddle.ones(shape=[3, 2]) + # [[1. 1.] + # [1. 1.] + # [1. 
1.]] + + data2 = paddle.ones(shape=[2, 2], dtype='int32') + # [[1 1] + # [1 1]] + + # shape is a Tensor + shape = paddle.full(shape=[2], dtype='int32', fill_value=2) + data3 = paddle.ones(shape=shape, dtype='int32') + # [[1 1] + # [1 1]] """ if dtype is None: dtype = 'float32' @@ -713,30 +711,29 @@ def full(shape, fill_value, dtype=None, name=None): Examples: .. code-block:: python - :name: code-example1 - import paddle + import paddle - data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') - #[[0] - # [0]] + data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') + #[[0] + # [0]] - # attr shape is a list which contains Tensor. - positive_2 = paddle.full([1], 2, "int32") - data3 = paddle.full(shape=[1, positive_2], dtype='float32', fill_value=1.5) - # [[1.5 1.5]] + # attr shape is a list which contains Tensor. + positive_2 = paddle.full([1], 2, "int32") + data3 = paddle.full(shape=[1, positive_2], dtype='float32', fill_value=1.5) + # [[1.5 1.5]] - # attr shape is a Tensor. - shape = paddle.full([2], 2, "int32") - data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) - # [[True True] - # [True True]] - - # attr fill_value is a Tensor. - val = paddle.full([1], 2.0, "float32") - data5 = paddle.full(shape=[2,1], fill_value=val, dtype='float32') - # [[2.0] - # [2.0]] + # attr shape is a Tensor. + shape = paddle.full([2], 2, "int32") + data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) + # [[True True] + # [True True]] + + # attr fill_value is a Tensor. + val = paddle.full([1], 2.0, "float32") + data5 = paddle.full(shape=[2,1], fill_value=val, dtype='float32') + # [[2.0] + # [2.0]] """ if dtype is None: @@ -1110,57 +1107,59 @@ def diagflat(x, offset=0, name=None): Examples: .. code-block:: python + :name: code-example-1 - import paddle + import paddle + + x = paddle.to_tensor([1, 2, 3]) + y = paddle.diagflat(x) + print(y.numpy()) + # [[1 0 0] + # [0 2 0] + # [0 0 3]] + + y = paddle.diagflat(x, offset=1) + print(y.numpy()) + # [[0 1 0 0] + # [0 0 2 0] + # [0 0 0 3] + # [0 0 0 0]] + + y = paddle.diagflat(x, offset=-1) + print(y.numpy()) + # [[0 0 0 0] + # [1 0 0 0] + # [0 2 0 0] + # [0 0 3 0]] - x = paddle.to_tensor([1, 2, 3]) - y = paddle.diagflat(x) - print(y.numpy()) - # [[1 0 0] - # [0 2 0] - # [0 0 3]] - - y = paddle.diagflat(x, offset=1) - print(y.numpy()) - # [[0 1 0 0] - # [0 0 2 0] - # [0 0 0 3] - # [0 0 0 0]] - - y = paddle.diagflat(x, offset=-1) - print(y.numpy()) - # [[0 0 0 0] - # [1 0 0 0] - # [0 2 0 0] - # [0 0 3 0]] - .. 
code-block:: python + :name: code-example-2 - import paddle + import paddle - x = paddle.to_tensor([[1, 2], [3, 4]]) - y = paddle.diagflat(x) - print(y.numpy()) - # [[1 0 0 0] - # [0 2 0 0] - # [0 0 3 0] - # [0 0 0 4]] - - y = paddle.diagflat(x, offset=1) - print(y.numpy()) - # [[0 1 0 0 0] - # [0 0 2 0 0] - # [0 0 0 3 0] - # [0 0 0 0 4] - # [0 0 0 0 0]] - - y = paddle.diagflat(x, offset=-1) - print(y.numpy()) - # [[0 0 0 0 0] - # [1 0 0 0 0] - # [0 2 0 0 0] - # [0 0 3 0 0] - # [0 0 0 4 0]] + x = paddle.to_tensor([[1, 2], [3, 4]]) + y = paddle.diagflat(x) + print(y.numpy()) + # [[1 0 0 0] + # [0 2 0 0] + # [0 0 3 0] + # [0 0 0 4]] + + y = paddle.diagflat(x, offset=1) + print(y.numpy()) + # [[0 1 0 0 0] + # [0 0 2 0 0] + # [0 0 0 3 0] + # [0 0 0 0 4] + # [0 0 0 0 0]] + + y = paddle.diagflat(x, offset=-1) + print(y.numpy()) + # [[0 0 0 0 0] + # [1 0 0 0 0] + # [0 2 0 0 0] + # [0 0 3 0 0] + # [0 0 0 4 0]] """ padding_value = 0 if paddle.in_dynamic_mode(): @@ -1240,47 +1239,49 @@ def diag(x, offset=0, padding_value=0, name=None): Examples: .. code-block:: python + :name: code-example-1 - import paddle + import paddle - paddle.disable_static() - x = paddle.to_tensor([1, 2, 3]) - y = paddle.diag(x) - print(y.numpy()) - # [[1 0 0] - # [0 2 0] - # [0 0 3]] - - y = paddle.diag(x, offset=1) - print(y.numpy()) - # [[0 1 0 0] - # [0 0 2 0] - # [0 0 0 3] - # [0 0 0 0]] - - y = paddle.diag(x, padding_value=6) - print(y.numpy()) - # [[1 6 6] - # [6 2 6] - # [6 6 3]] + paddle.disable_static() + x = paddle.to_tensor([1, 2, 3]) + y = paddle.diag(x) + print(y.numpy()) + # [[1 0 0] + # [0 2 0] + # [0 0 3]] + + y = paddle.diag(x, offset=1) + print(y.numpy()) + # [[0 1 0 0] + # [0 0 2 0] + # [0 0 0 3] + # [0 0 0 0]] + + y = paddle.diag(x, padding_value=6) + print(y.numpy()) + # [[1 6 6] + # [6 2 6] + # [6 6 3]] .. code-block:: python + :name: code-example-2 - import paddle + import paddle - paddle.disable_static() - x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) - y = paddle.diag(x) - print(y.numpy()) - # [1 5] + paddle.disable_static() + x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + y = paddle.diag(x) + print(y.numpy()) + # [1 5] - y = paddle.diag(x, offset=1) - print(y.numpy()) - # [2 6] + y = paddle.diag(x, offset=1) + print(y.numpy()) + # [2 6] - y = paddle.diag(x, offset=-1) - print(y.numpy()) - # [4] + y = paddle.diag(x, offset=-1) + print(y.numpy()) + # [4] """ if in_dygraph_mode(): return _C_ops.final_state_diag(x, offset, padding_value) @@ -1485,18 +1486,17 @@ def assign(x, output=None): Examples: .. 
code-block:: python - :name: assign-example - import paddle - import numpy as np - data = paddle.full(shape=[3, 2], fill_value=2.5, dtype='float64') # [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] - array = np.array([[1, 1], - [3, 4], - [1, 3]]).astype(np.int64) - result1 = paddle.zeros(shape=[3, 3], dtype='float32') - paddle.assign(array, result1) # result1 = [[1, 1], [3 4], [1, 3]] - result2 = paddle.assign(data) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] - result3 = paddle.assign(np.array([[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]], dtype='float32')) # result3 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + import paddle + import numpy as np + data = paddle.full(shape=[3, 2], fill_value=2.5, dtype='float64') # [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + array = np.array([[1, 1], + [3, 4], + [1, 3]]).astype(np.int64) + result1 = paddle.zeros(shape=[3, 3], dtype='float32') + paddle.assign(array, result1) # result1 = [[1, 1], [3 4], [1, 3]] + result2 = paddle.assign(data) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + result3 = paddle.assign(np.array([[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]], dtype='float32')) # result3 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] """ input = x helper = LayerHelper('assign', **locals()) @@ -1628,7 +1628,8 @@ def clone(x, name=None): x (Tensor): The input Tensor. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - Returns: A Tensor copied from ``input`` . + Returns: + Tensor, A Tensor copied from ``input``. Examples: .. code-block:: python @@ -1661,7 +1662,7 @@ def _memcpy(input, place=None, output=None): be created as :attr:`output`. Default: None. Returns: - Tensor: A tensor with the same shape, data type and value as :attr:`input`. + Tensor, A tensor with the same shape, data type and value as :attr:`input`. Examples: .. code-block:: python @@ -1777,7 +1778,6 @@ def tril_indices(row, col, offset=0, dtype='int64'): Examples: .. code-block:: python - :name: tril_indices-example import paddle diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 4fd393da6f1bb..80faf13c9b0d5 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -3267,8 +3267,7 @@ def corrcoef(x, rowvar=True, name=None): Examples: .. code-block:: python - :name: code-example1 - + import paddle xt = paddle.rand((3,4)) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index ea7ef4ff9d724..05dc8035c5d59 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -208,7 +208,7 @@ def slice(input, axes, starts, ends): if isinstance(item, tmp_tensor_type) else item for item in ends ] elif isinstance(ends, tmp_tensor_type): - etensor_t = ends.numpy() + tensor_t = ends.numpy() ends = [ele for ele in tensor_t] infer_flags = list(-1 for i in range(len(axes))) @@ -599,7 +599,7 @@ def crop(x, shape=None, offsets=None, name=None): Parameters: x (Tensor): 1-D to 6-D Tensor, the data type is float32, float64, int32 or int64. - shape (list|tuple|Tensor): The output shape is specified + shape (list|tuple|Tensor, optional): The output shape is specified by `shape`. Its data type is int32. If a list/tuple, it's length must be the same as the dimension size of `x`. If a Tensor, it should be a 1-D Tensor. When it is a list, each element can be an integer or a Tensor of shape: [1]. @@ -619,7 +619,6 @@ def crop(x, shape=None, offsets=None, name=None): Examples: .. 
code-block:: python - :name: code-example1 import paddle x = paddle.to_tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) @@ -778,8 +777,11 @@ def fill_(x, value): raise TypeError( "The type of 'value' must be int or float, but received %s." % (type(value))) - return _C_ops.fill_any_(x, "value_float", float(value), "value_int", - int(value)) + if in_dygraph_mode(): + return _C_ops.final_state_fill_(x, value) + else: + return _C_ops.fill_any_(x, "value_float", float(value), "value_int", + int(value)) @dygraph_only @@ -807,7 +809,10 @@ def zero_(x): print(tensor.tolist()) #[0, 0, 0, 0, 0] """ - return _C_ops.fill_any_(x, "value_float", 0., "value_int", int(0)) + if in_dygraph_mode(): + return _C_ops.final_state_fill_(x, 0.) + else: + return _C_ops.fill_any_(x, "value_float", 0., "value_int", int(0)) @dygraph_only @@ -888,10 +893,17 @@ def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False): y = y.reshape([1, -1]) if inplace: - return _C_ops.fill_diagonal_tensor_(x, y, 'dim1', dim1, 'dim2', dim2, - 'offset', offset) - return _C_ops.fill_diagonal_tensor(x, y, 'dim1', dim1, 'dim2', dim2, - 'offset', offset) + if in_dygraph_mode(): + return _C_ops.final_state_fill_diagonal_tensor_( + x, y, offset, dim1, dim2) + else: + return _C_ops.fill_diagonal_tensor_(x, y, 'offset', offset, 'dim1', + dim1, 'dim2', dim2) + if in_dygraph_mode(): + return _C_ops.final_state_fill_diagonal_tensor(x, y, offset, dim1, dim2) + else: + return _C_ops.fill_diagonal_tensor(x, y, 'offset', offset, 'dim1', dim1, + 'dim2', dim2) def fill_diagonal_tensor_(x, y, offset=0, dim1=0, dim2=1, name=None): @@ -1105,9 +1117,7 @@ def concat(x, axis=0, name=None): attrs = {} if isinstance(axis, Variable): axis.stop_gradient = True - inputs['AxisTensor'] = axis - else: - attrs['axis'] = axis + attrs['axis'] = axis helper.append_op(type='concat', inputs=inputs, @@ -1968,7 +1978,7 @@ def squeeze(x, axis=None, name=None): Examples: .. code-block:: python - :name: code-example1 + import paddle x = paddle.rand([5, 1, 10]) @@ -2180,7 +2190,7 @@ def unique(x, Examples: .. code-block:: python - :name: code-example1 + import paddle x = paddle.to_tensor([2, 3, 3, 1, 5, 3]) @@ -2925,13 +2935,11 @@ def get_attr_repeat_times(list_repeat_times): if isinstance(repeat_times, Variable): repeat_times.stop_gradient = True - inputs['RepeatTimes'] = repeat_times - attrs['repeat_times'] = [-1] + attrs['repeat_times'] = repeat_times elif isinstance(repeat_times, (list, tuple)): attrs['repeat_times'] = get_attr_repeat_times(repeat_times) if utils._contain_var(repeat_times): - inputs['repeat_times_tensor'] = utils._convert_to_tensor_list( - repeat_times) + attrs['repeat_times'] = utils._convert_to_tensor_list(repeat_times) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) @@ -3212,7 +3220,6 @@ def reshape(x, shape, name=None): Examples: .. code-block:: python - :name: code-example1 import paddle @@ -4178,7 +4185,6 @@ def take_along_axis(arr, indices, axis): Examples: .. code-block:: python - :name: code-example1 import paddle @@ -4244,7 +4250,6 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'): Examples: .. code-block:: python - :name: code-example1 import paddle diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 6d365622746e3..b94329eb9213c 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -288,7 +288,6 @@ def multiplex(inputs, index, name=None): Examples: .. 
code-block:: python - :name: code-example1 import paddle @@ -1163,12 +1162,7 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): axis = [axis] if not axis: - reduce_all_flag = True - else: - if len(axis) == len(x.shape): - reduce_all_flag = True - else: - reduce_all_flag = False + axis = [] dtype_flag = False if dtype is not None: @@ -1176,13 +1170,16 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - if reduce_all_flag: - axis = range(len(x.shape)) - else: - axis = axis if axis != None and axis != [] else [0] - return _C_ops.final_state_sum(x, axis, dtype, keepdim) + if len(axis) == 0: + reduce_all_flag = True + else: + if len(axis) == len(x.shape): + reduce_all_flag = True + else: + reduce_all_flag = False + if _in_legacy_dygraph(): axis = axis if axis != None and axis != [] else [0] if dtype_flag: @@ -1376,7 +1373,6 @@ def count_nonzero(x, axis=None, keepdim=False, name=None): Examples: .. code-block:: python - :name: count_nonzero-example import paddle # x is a 2-D Tensor: @@ -1468,7 +1464,7 @@ def add_n(inputs, name=None): Examples: .. code-block:: python - :name: code-example1 + import paddle input0 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype='float32') @@ -2058,6 +2054,24 @@ def _check_input(x): type='inverse', inputs={'Input': [x] }, outputs={'Output': [out]}) return out +def _get_reduce_axis(axis): + """ + Internal function for max, min, amax and amin. + It computes the attribute reduce_all value based on axis. + """ + if axis is not None and not isinstance(axis, list): + if isinstance(axis, tuple): + axis = list(axis) + elif isinstance(axis, int): + axis= [axis] + else: + raise TypeError( + "The type of axis must be int, list or tuple, but received {}".format(type(axis))) + reduce_all = True if axis == None or axis == [] else False + if axis == None: + axis = [] + return reduce_all, axis + def _get_reduce_all_value(axis): """ Internal function for max, min, amax and amin. 
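A minimal standalone sketch (illustrative only, not applied by this patch) of what the new `_get_reduce_axis` helper returns, following the same logic as the hunk above; the example calls and printed results are assumptions for illustration, and `is None` replaces the patch's `== None` comparison:

    def _get_reduce_axis(axis):
        # Normalize axis to a list and decide whether the reduction covers all dims.
        if axis is not None and not isinstance(axis, list):
            if isinstance(axis, tuple):
                axis = list(axis)
            elif isinstance(axis, int):
                axis = [axis]
            else:
                raise TypeError(
                    "The type of axis must be int, list or tuple, but received {}".format(
                        type(axis)))
        reduce_all = True if axis is None or axis == [] else False
        if axis is None:
            axis = []
        return reduce_all, axis

    print(_get_reduce_axis(None))    # (True, [])
    print(_get_reduce_axis(1))       # (False, [1])
    print(_get_reduce_axis((0, 2)))  # (False, [0, 2])

In the hunks that follow, the legacy _C_ops.reduce_* calls consume the reduce_all flag, while the final_state_* calls receive the (possibly empty) axis list directly.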
@@ -2154,10 +2168,8 @@ def max(x, axis=None, keepdim=False, name=None): #[7., 8.], [[[0., 0.], [0., 0.]], [[0., 0.], [1., 1.]]] """ - reduce_all, axis = _get_reduce_all_value(axis) + reduce_all, axis = _get_reduce_axis(axis) if in_dygraph_mode(): - if reduce_all: - axis = range(len(x.shape)) return _C_ops.final_state_max(x, axis, keepdim) if _in_legacy_dygraph(): return _C_ops.reduce_max(x, 'dim', axis, 'keep_dim', keepdim, @@ -2257,10 +2269,8 @@ def min(x, axis=None, keepdim=False, name=None): #[1., 2.], [[[1., 1.], [0., 0.]], [[0., 0.], [0., 0.]]] """ - reduce_all, axis = _get_reduce_all_value(axis) + reduce_all, axis = _get_reduce_axis(axis) if in_dygraph_mode(): - if reduce_all: - axis = range(len(x.shape)) return _C_ops.final_state_min(x, axis, keepdim) if _in_legacy_dygraph(): @@ -2374,10 +2384,8 @@ def amax(x, axis=None, keepdim=False, name=None): #[0.9., 0.9], [[[0., 0.3333], [0.5, 0.3333]], [[0.5, 0.3333], [1., 1.]]] """ - reduce_all, axis = _get_reduce_all_value(axis) + reduce_all, axis = _get_reduce_axis(axis) if in_dygraph_mode(): - if reduce_all: - axis = range(len(x.shape)) return _C_ops.final_state_amax(x, axis, keepdim) if _in_legacy_dygraph(): return _C_ops.reduce_amax(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) @@ -2490,10 +2498,8 @@ def amin(x, axis=None, keepdim=False, name=None): #[0.1., 0.1], [[[0., 0.3333], [0.5, 0.3333]], [[0.5, 0.3333], [1., 1.]]] """ - reduce_all, axis = _get_reduce_all_value(axis) + reduce_all, axis = _get_reduce_axis( axis ) if in_dygraph_mode(): - if reduce_all: - axis = range(len(x.shape)) return _C_ops.final_state_amin(x, axis, keepdim) elif _in_legacy_dygraph(): return _C_ops.reduce_amin(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) @@ -4629,7 +4635,6 @@ def heaviside(x, y, name=None): Examples: .. code-block:: python - :name: heaviside-example import paddle x = paddle.to_tensor([-0.5, 0, 0.5]) @@ -4662,7 +4667,7 @@ def frac(x, name=None): Tensor: The output Tensor of frac. Examples: - .. code-block:: Python + .. code-block:: python import paddle import numpy as np diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 663c2ccb91855..df93b30b7c224 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -47,7 +47,6 @@ def bernoulli(x, name=None): Examples: .. code-block:: python - :name: bernoulli-example import paddle diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 1fe26f0ae48be..0324766d3ec43 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -225,7 +225,7 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): Examples: .. code-block:: python - :name: code-example1 + import paddle x = paddle.to_tensor([[5,8,9,5], @@ -447,7 +447,6 @@ def sort(x, axis=-1, descending=False, name=None): Examples: .. code-block:: python - :name: code-example1 import paddle @@ -849,7 +848,7 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): Examples: .. code-block:: python - :name: code-example1 + import paddle data_1 = paddle.to_tensor([1, 4, 5, 7]) diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index ed6cf6fd593ec..f0bc955734d61 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -272,7 +272,6 @@ def nanmedian(x, axis=None, keepdim=True, name=None): Examples: .. code-block:: python - :name: nanmedian-example import paddle x = paddle.to_tensor([[float('nan'), 2. , 3. ], [0. , 1. , 2. 
]]) diff --git a/python/paddle/tests/test_dlpack.py b/python/paddle/tests/test_dlpack.py index 5ca49a09fe87b..076fe5545db1c 100644 --- a/python/paddle/tests/test_dlpack.py +++ b/python/paddle/tests/test_dlpack.py @@ -33,9 +33,8 @@ def func_test_dlpack_dygraph(self): isinstance(out_from_dlpack, paddle.fluid.core.eager.Tensor)) else: self.assertTrue(isinstance(out_from_dlpack, paddle.Tensor)) - self.assertTrue( - np.array_equal(np.array(out_from_dlpack), - np.array([1, 2, 3, 4]).astype('int'))) + np.testing.assert_array_equal(np.array(out_from_dlpack), + np.array([1, 2, 3, 4]).astype('int')) def test_dlpack_dygraph(self): with _test_eager_guard(): @@ -64,9 +63,9 @@ def test_dlpack_static(self): dlpack = paddle.utils.dlpack.to_dlpack(tensor) out_from_dlpack = paddle.utils.dlpack.from_dlpack(dlpack) self.assertTrue(isinstance(out_from_dlpack, fluid.core.Tensor)) - self.assertTrue( - np.array_equal(np.array(out_from_dlpack), - np.array([[1], [2], [3], [4]]).astype('int'))) + np.testing.assert_array_equal( + np.array(out_from_dlpack), + np.array([[1], [2], [3], [4]]).astype('int')) # when build with cuda if core.is_compiled_with_cuda(): @@ -76,9 +75,9 @@ def test_dlpack_static(self): gdlpack = paddle.utils.dlpack.to_dlpack(gtensor) gout_from_dlpack = paddle.utils.dlpack.from_dlpack(gdlpack) self.assertTrue(isinstance(gout_from_dlpack, fluid.core.Tensor)) - self.assertTrue( - np.array_equal(np.array(gout_from_dlpack), - np.array([[1], [2], [3], [4]]).astype('int'))) + np.testing.assert_array_equal( + np.array(gout_from_dlpack), + np.array([[1], [2], [3], [4]]).astype('int')) def func_test_dlpack_dtype_conversion(self): paddle.disable_static() diff --git a/python/paddle/tests/test_hapi_amp.py b/python/paddle/tests/test_hapi_amp.py index 24df22ab5ea54..900a8a0fcc25d 100644 --- a/python/paddle/tests/test_hapi_amp.py +++ b/python/paddle/tests/test_hapi_amp.py @@ -127,12 +127,9 @@ def test_save_load(self): model._scaler.state_dict()['incr_count']) self.assertEqual(new_model._scaler.state_dict()['decr_count'], model._scaler.state_dict()['decr_count']) - self.assertTrue( - np.array_equal( - new_model._optimizer.state_dict() - ['conv2d_1.w_0_moment1_0'].numpy(), - model._optimizer.state_dict() - ['conv2d_1.w_0_moment1_0'].numpy())) + np.testing.assert_array_equal( + new_model._optimizer.state_dict()['conv2d_1.w_0_moment1_0'].numpy(), + model._optimizer.state_dict()['conv2d_1.w_0_moment1_0'].numpy()) def test_dynamic_check_input(self): paddle.disable_static() diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 73642e1a004ca..0367e9ed3e354 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -139,7 +139,8 @@ def setup(**attr): compiler using dict type with ``{'cxx': [...], 'nvcc': [...]}`` . Default is None. **attr(dict, optional): Specify other arguments same as ``setuptools.setup`` . - Returns: None + Returns: + None """ cmdclass = attr.get('cmdclass', {}) diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 484fcf95cb269..cf038d18ae3a4 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1075,8 +1075,7 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): Examples: .. 
code-block:: python - :name: code-example1 - + import paddle x = paddle.uniform([2, 490, 28, 28], dtype='float32') boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32') @@ -1144,7 +1143,7 @@ class PSRoIPool(Layer): Examples: .. code-block:: python - :name: code-example1 + import paddle psroi_module = paddle.vision.ops.PSRoIPool(7, 1.0) @@ -1350,7 +1349,7 @@ def roi_align(x, Examples: .. code-block:: python - :name: code-example1 + import paddle from paddle.vision.ops import roi_align @@ -1426,7 +1425,7 @@ class RoIAlign(Layer): Examples: .. code-block:: python - :name: code-example1 + import paddle from paddle.vision.ops import RoIAlign @@ -1740,7 +1739,15 @@ def generate_proposals(scores, print(rois, roi_probs, roi_nums) """ - if _non_static_mode(): + if in_dygraph_mode(): + assert return_rois_num, "return_rois_num should be True in dygraph mode." + attrs = (pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta, + pixel_offset) + rpn_rois, rpn_roi_probs, rpn_rois_num = _C_ops.final_state_generate_proposals_v2( + scores, bbox_deltas, img_size, anchors, variances, *attrs) + + return rpn_rois, rpn_roi_probs, rpn_rois_num + elif _non_static_mode(): assert return_rois_num, "return_rois_num should be True in dygraph mode." attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta, diff --git a/python/setup.py.in b/python/setup.py.in index 1b36b272d0d70..55129c47c220b 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -400,6 +400,8 @@ packages=['paddle', 'paddle.device.cuda', 'paddle.version', 'paddle.profiler', + 'paddle.geometric', + 'paddle.geometric.message_passing', ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: @@ -618,15 +620,19 @@ headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/backends', recursive=True)) + # phi backends headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) + # phi core headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/infermeta', recursive=True)) + # phi infermeta headers - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels', recursive=True)) + # phi kernels headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels')) + # phi kernels headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels/sparse')) + # phi sparse kernels headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels/selected_rows')) + # phi selected_rows kernels headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels/strings')) + # phi sparse kernels headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels/primitive')) + # phi kernel primitive api headers # capi headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/capi', recursive=True)) + # phi capi headers # utils api headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True))) # paddle utils headers -jit_layer_headers = ['layer.h', 'serializer.h', 'serializer_utils.h', 'all.h', 'base_function.h'] +jit_layer_headers = ['layer.h', 'serializer.h', 'serializer_utils.h', 'all.h', 'function.h'] for f in jit_layer_headers: - headers += list(find_files(f, '@PADDLE_SOURCE_DIR@/paddle/fluid/jit', recursive=False)) + headers += list(find_files(f, '@PADDLE_SOURCE_DIR@/paddle/fluid/jit', recursive=True)) if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn diff --git 
a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 49d614fa99107..1a454f8289091 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -64,9 +64,9 @@ fi api_src_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5 ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` if [ "$api_src_spec_diff" != "" ]; then echo_line="APIs without core.ops: \n${api_src_spec_diff}\n" - echo_line="${echo_line}You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without '_C_ops'.\n" + echo_line="${echo_line}You must have one RD (JiabinYang (Recommend) or wanghuancoder, phlrain) approval for the api change for the opreator-related api without '_C_ops'.\n" echo_line="${echo_line}For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/paddle_api_development_manual.md]\n" - check_approval 1 6888866 43953930 + check_approval 1 JiabinYang wanghuancoder phlrain fi op_type_spec_diff=`python ${PADDLE_ROOT}/tools/check_op_register_type.py ${PADDLE_ROOT}/paddle/fluid/OP_TYPE_DEV.spec ${PADDLE_ROOT}/paddle/fluid/OP_TYPE_PR.spec` diff --git a/tools/coverage/gcda_clean.py b/tools/coverage/gcda_clean.py index 062b8f356d667..286f50eae9ab6 100644 --- a/tools/coverage/gcda_clean.py +++ b/tools/coverage/gcda_clean.py @@ -87,12 +87,15 @@ def clean(pull_id): # convert paddle/fluid/imperative/CMakeFiles/layer.dir/layer.cc.gcda # to paddle/fluid/imperative/layer.cc.gcda - - if trimmed.endswith('.dir'): - trimmed = os.path.dirname(trimmed) - - if trimmed.endswith('CMakeFiles'): - trimmed = os.path.dirname(trimmed) + # modifed to make it more robust + # covert /paddle/build/paddle/phi/backends/CMakeFiles/phi_backends.dir/gpu/cuda/cuda_info.cc.gcda + # to /paddle/build/paddle/phi/backends/gpu/cuda/cuda_info.cc.gcda + trimmed_tmp = [] + for p in trimmed.split('/'): + if p.endswith('.dir') or p.endswith('CMakeFiles'): + continue + trimmed_tmp.append(p) + trimmed = '/'.join(trimmed_tmp) # remove no changed gcda diff --git a/tools/dockerfile/Dockerfile.ipu b/tools/dockerfile/Dockerfile.ipu index 2a3bfd332ba8d..b79a113332a3e 100644 --- a/tools/dockerfile/Dockerfile.ipu +++ b/tools/dockerfile/Dockerfile.ipu @@ -25,10 +25,16 @@ RUN apt-get update && apt-get install -y curl wget vim git unzip unrar tar xz-ut openssl libffi-dev pciutils libblas-dev gfortran libblas3 liblapack-dev liblapack3 default-jre screen tmux gdb lldb gcc g++ RUN apt-get update && apt-get install -y rdma-core librdmacm1 -# install g++-8 -RUN apt install g++-8 gcc-8 -y -RUN ln -sf /usr/bin/gcc-8 /usr/bin/gcc -RUN ln -sf /usr/bin/g++-8 /usr/bin/g++ +# Downgrade gcc&&g++ +WORKDIR /usr/bin +COPY tools/dockerfile/build_scripts /build_scripts +RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts +RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ +RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc +RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ +RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc +RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ +ENV PATH=/usr/local/gcc-8.2/bin:$PATH # install cmake WORKDIR /home From 3f49817a453ce834c17920302d8432350e2183b3 Mon Sep 17 00:00:00 2001 From: yeliang2258 <30516196+yeliang2258@users.noreply.github.com> Date: Wed, 10 Aug 2022 19:00:54 +0800 Subject: [PATCH 3/4] fix mkldnn interpolate ops (#45008) --- paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index c3710342d805f..64d7bca4d0646 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -105,7 +105,8 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { } } } - if (scale[0] > 0.0f && scale[1] > 0.0f && scale[2] > 0.0f) { + if (scale.size() == 3 && scale[0] > 0.0f && scale[1] > 0.0f && + scale[2] > 0.0f) { int j = 0; std::vector in_dhw_vec = phi::vectorize(in_dhw_dims); std::transform( From 4805da503a9db4c12f4361287f5f9b5594906798 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Wed, 10 Aug 2022 19:13:30 +0800 Subject: [PATCH 4/4] [Paddle Inference]Disable skip layernorm half (#45047) * disable_skip_layernorm_fp16 --- paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 25a6861eb67c2..26675fdddbac2 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -163,8 +163,10 @@ class SkipLayerNormOpConverter : public OpConverter { auto scale_weight = GetFp32Weight("Scale").get(); float eps = PADDLE_GET_CONST(float, op_desc.GetAttr("epsilon")); - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + // bool with_fp16 = + // engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + bool with_fp16 = false; + plugin::SkipLayerNormPluginDynamic* plugin = new plugin::SkipLayerNormPluginDynamic( static_cast(bias_weight.values),