diff --git a/CMakeLists.txt b/CMakeLists.txt index b1554fba5e1fa..fa87cc14f2668 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,7 @@ option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) +option(WITH_MUSL "Compile with musl libc instead of glibc" OFF) # PY_VERSION if(NOT PY_VERSION) diff --git a/README.md b/README.md index d14d0ef001481..580ebca8ef308 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ pip install paddlepaddle # Linux GPU cuda10cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda9cudnn7 -pip install paddlepaddle-gpu==1.8.4.post97 +pip install paddlepaddle-gpu==1.8.5.post97 ``` It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website. diff --git a/README_cn.md b/README_cn.md index e4544a3eff6e5..ee8cfbef1cef9 100644 --- a/README_cn.md +++ b/README_cn.md @@ -30,7 +30,7 @@ pip install paddlepaddle # Linux GPU cuda10cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda9cudnn7 -pip install paddlepaddle-gpu==1.8.4.post97 +pip install paddlepaddle-gpu==1.8.5.post97 ``` 更多安装信息详见官网 [安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/install/index_cn.html) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index cf458d9770675..fc984f5e560ef 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -51,6 +51,16 @@ if(WIN32) endif(NOT MSVC) endif(WIN32) +if(WITH_MUSL) + add_definitions(-DPADDLE_WITH_MUSL) + + message(STATUS "Set compile option WITH_MKL=OFF when WITH_MUSL=ON") + SET(WITH_MKL OFF) + + message(STATUS "Set compile option WITH_GPU=OFF when WITH_MUSL=ON") + SET(WITH_GPU OFF) +endif() + if(WITH_PSLIB) add_definitions(-DPADDLE_WITH_PSLIB) endif() diff --git a/cmake/external/lite.cmake 
b/cmake/external/lite.cmake index 3da550519bae2..1da47bba7b6a5 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -if(NOT LINUX OR NOT WITH_MKL) - message("Paddle-lite will not build because the required Linux and MKL do not exist.") +if(NOT LINUX) + message("Paddle-lite will not be built because it requires Linux.") set(WITH_LITE OFF) return() endif() @@ -42,30 +42,30 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) endif() # No quotes, so cmake can resolve it as a command with arguments. - set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) - set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON - -DLITE_WITH_CUDA=${WITH_GPU} - -DWITH_MKLDNN=OFF - -DLITE_WITH_X86=ON - -DLITE_WITH_PROFILE=OFF - -DWITH_LITE=OFF - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF - -DWITH_PYTHON=OFF - -DWITH_TESTING=OFF - -DLITE_BUILD_EXTRA=ON - -DCUDNN_ROOT=${CUDNN_ROOT} - -DLITE_WITH_STATIC_CUDA=OFF - -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME} - -DLITE_WITH_XPU=${LITE_WITH_XPU} - -DXPU_SDK_ROOT=${XPU_SDK_ROOT} - -DLITE_WITH_ARM=OFF) - - ExternalProject_Add( + if(WITH_ARM) + set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) + message(STATUS "BUILD_COMMAND: ${LITE_BUILD_COMMAND}") + set(LITE_OPTIONAL_ARGS -DWITH_MKL=OFF + -DLITE_WITH_CUDA=OFF + -DWITH_MKLDNN=OFF + -DLITE_WITH_X86=OFF + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON + -DLITE_WITH_PROFILE=OFF + -DARM_TARGET_OS=armlinux + -DWITH_LITE=ON + -DWITH_PYTHON=OFF + -DWITH_TESTING=OFF + -DLITE_BUILD_EXTRA=ON + -DLITE_WITH_XPU=${LITE_WITH_XPU} + -DXPU_SDK_ROOT=${XPU_SDK_ROOT} + -DLITE_WITH_ARM=ON) + ExternalProject_Add( ${LITE_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle-Lite.git" GIT_TAG ${LITE_GIT_TAG} PREFIX ${LITE_SOURCES_DIR} + PATCH_COMMAND mkdir -p ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code && touch 
${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc UPDATE_COMMAND "" BUILD_COMMAND ${LITE_BUILD_COMMAND} INSTALL_COMMAND "" @@ -81,7 +81,51 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} ${LITE_OPTIONAL_ARGS} - ) + ) + set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8) + else() + set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) + set(LITE_OUTPUT_BIN_DIR inference_lite_lib) + set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON + -DLITE_WITH_CUDA=${WITH_GPU} + -DWITH_MKLDNN=OFF + -DLITE_WITH_X86=ON + -DLITE_WITH_PROFILE=OFF + -DWITH_LITE=OFF + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF + -DWITH_PYTHON=OFF + -DWITH_TESTING=OFF + -DLITE_BUILD_EXTRA=ON + -DCUDNN_ROOT=${CUDNN_ROOT} + -DLITE_WITH_STATIC_CUDA=OFF + -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME} + -DLITE_WITH_XPU=${LITE_WITH_XPU} + -DXPU_SDK_ROOT=${XPU_SDK_ROOT} + -DLITE_WITH_ARM=OFF) + + ExternalProject_Add( + ${LITE_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle-Lite.git" + GIT_TAG ${LITE_GIT_TAG} + PREFIX ${LITE_SOURCES_DIR} + UPDATE_COMMAND "" + BUILD_COMMAND ${LITE_BUILD_COMMAND} + INSTALL_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + ${LITE_OPTIONAL_ARGS} + ) + endif() ExternalProject_Get_property(${LITE_PROJECT} BINARY_DIR) ExternalProject_Get_property(${LITE_PROJECT} SOURCE_DIR) set(LITE_BINARY_DIR ${BINARY_DIR}) @@ -103,8 +147,8 @@ function(external_lite_libs alias path) endif() endfunction() 
-external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so) -set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so) +external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so) +set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so) add_definitions(-DPADDLE_WITH_LITE) add_definitions(-DLITE_WITH_LOG) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index c0adda0da31ae..e3ac8624a809a 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 64a48f9565aa72f6359917b3406328075a409939) +SET(MKLDNN_TAG 361725600224f41b7347a1c6bee9b04d1e6c14d7) # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index f4603051a0e7e..d5ef6d85b578f 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -131,7 +131,7 @@ function(copy_part_of_thrid_party TARGET DST) if (LITE_BINARY_DIR) set(dst_dir "${DST}/third_party/install/lite") copy(${TARGET} - SRCS ${LITE_BINARY_DIR}/inference_lite_lib/* + SRCS ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/* DSTS ${dst_dir}) endif() endfunction() diff --git a/cmake/init.cmake b/cmake/init.cmake index 902dfb11fc0af..5f36a9adf1ae6 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -28,5 +28,6 @@ endif() if(WIN32) set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Os -DNDEBUG") endif() diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 
bb5e2e1369a84..d31943289d7a1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -123,7 +123,9 @@ cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce) cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc device_context) -cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost) +cc_library(op_version_proto SRCS op_version_proto.cc DEPS framework_proto boost) + +cc_library(op_version_registry SRCS op_version_registry.cc DEPS op_version_proto framework_proto boost) cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 108cd9ac6d1c0..8563b5b6d3695 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -203,7 +203,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, // As MKL-DNN description was in NCHW and paddle is expecting NHWC platform::MatchShapeToLayout(out, in_layout, out_layout); - out->set_layout(out_layout); + out->set_layout(DataLayout::kNCHW); // reset format since the out tensor will be feed to non-MKLDNN OPkernel out->set_format(MKLDNNMemoryFormat::undef); } diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 3a40de6988f29..70693a5df2609 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -117,6 +117,9 @@ void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, auto *tran_lod_tensor = out_var->GetMutable(); tran_lod_tensor->set_lod(in_lod_tensor.lod()); tran_lod_tensor->set_layout(in_lod_tensor.layout()); +#ifdef PADDLE_WITH_MKLDNN + tran_lod_tensor->set_format(in_lod_tensor.format()); +#endif 
tran_lod_tensor->ShareDataWith(tensor); } else if (in_var.IsType()) { auto &in_selected_rows = in_var.Get(); diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 21e28d7ac86d0..881ef30ffe690 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -98,6 +98,7 @@ message AsyncConfig { optional int32 send_wait_times = 7 [ default = 1 ]; optional bool runtime_split_send_recv = 8 [ default = false ]; optional bool launch_barrier = 9 [ default = true ]; + optional string heter_worker_device_guard = 10 [ default = 'cpu' ]; } message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; } diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 29312370b3448..c33d71b3b0a9c 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -179,29 +179,15 @@ message BlockDesc { optional int32 forward_block_idx = 5 [ default = -1 ]; } -// CompatibleInfo is used to determine if a feature is compatible and -// provides the information. -message CompatibleInfo { - enum Type { - COMPATIBLE = 0; - DEFINITELY_NOT = 1; - POSSIBLE = 2; - BUG_FIX = 3; - PRECISION_CHANGE = 4; - } - required string version = 1; - required Type type = 2; -} - -// In some cases, Paddle Fluid may perform operator definition iterations, -// and the operator uses OpCompatibleMap for compatibility testing. -message OpCompatibleMap { - message OpCompatiblePair { +// In some cases, Paddle may perform operator definition iterations, +// and the operator uses OpVersionMap for compatibility testing. 
+message OpVersion { required int32 version = 1; } +message OpVersionMap { + message OpVersionPair { required string op_name = 1; - required CompatibleInfo compatible_info = 2; + required OpVersion op_version = 2; } - repeated OpCompatiblePair pair = 1; - optional string default_required_version = 2; + repeated OpVersionPair pair = 1; } // Please refer to @@ -210,8 +196,8 @@ message OpCompatibleMap { // TODO(panyx0718): A model can have multiple programs. Need a // way to distinguish them. Maybe ID or name? message ProgramDesc { - reserved 2; // For backward compatibility. + reserved 2, 3; // For backward compatibility. repeated BlockDesc blocks = 1; optional Version version = 4; - optional OpCompatibleMap op_compatible_map = 3; + optional OpVersionMap op_version_map = 5; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 96952e20c2158..ed2863e8bf798 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1882,9 +1882,9 @@ PDNode *patterns::MultipleQuantize::operator()() { PDNode *patterns::QuantizePlacement::operator()( const std::unordered_set &quantize_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set({"concat", "conv2d", "elementwise_add", - "fc", "matmul", "pool2d", "prior_box", - "relu", "reshape2", "transpose2"}); + std::unordered_set( + {"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d", + "prior_box", "relu", "reshape2", "transpose2", "fusion_gru"}); if (!quantize_enabled_op_types.empty()) { supported_op_types = quantize_enabled_op_types; } @@ -1894,7 +1894,8 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { - std::unordered_set supported_op_types{"conv2d"}; + std::unordered_set supported_op_types = + std::unordered_set({"conv2d", "fusion_gru"}); if 
(!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } @@ -2280,6 +2281,23 @@ PDNode *patterns::MatmulTransposeReshapePattern::operator()() { return reshape_out; } +PDNode *patterns::FusionGru::operator()() { + auto op = pattern->NewNode(op_repr())->assert_is_op("fusion_gru"); + auto x = pattern->NewNode(x_repr())->AsInput()->assert_is_op_input( + "fusion_gru", "X"); + auto weight_h = pattern->NewNode(weight_h_repr()) + ->AsInput() + ->assert_is_op_input("fusion_gru", "WeightH"); + auto weight_x = pattern->NewNode(weight_x_repr()) + ->AsInput() + ->assert_is_op_input("fusion_gru", "WeightX"); + auto out = pattern->NewNode(out_repr()) + ->AsOutput() + ->assert_is_op_output("fusion_gru", "Hidden"); + op->LinksFrom({x, weight_h, weight_x}).LinksTo({out}); + return out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 7116b8a2a6f35..15f6ea1541d58 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1312,6 +1312,21 @@ struct MatmulTransposeReshapePattern : public PatternBase { PATTERN_DECL_NODE(reshape_out_xshape); }; +// fusion_gru op +// Forward pass for fusion_gru. +// fusion_gru out is a result of the operator. +struct FusionGru : public PatternBase { + FusionGru(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "fusion_gru") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(op); + PATTERN_DECL_NODE(x); + PATTERN_DECL_NODE(weight_h); + PATTERN_DECL_NODE(weight_x); + PATTERN_DECL_NODE(out); +}; + } // namespace patterns // Link two ir::Nodes from each other. 
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 0254b5e757351..58931f3ed3872 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -63,8 +63,9 @@ enum { U8_MAX = 255, S8_MAX = 127 }; void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, double scale_to_one, - bool is_unsigned, - std::string scale_attr_name) const { + bool is_input_unsigned, + std::string scale_attr_name, float shift, + std::string shift_attr_name) const { auto inputs = op->Op()->InputNames(); bool name_found = std::find(inputs.begin(), inputs.end(), input_name) != inputs.end(); @@ -72,7 +73,7 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, platform::errors::InvalidArgument( "Var(%s) isn't the input of the %s operator.", input_name, op->Op()->Type())); - unsigned max = is_unsigned ? U8_MAX : S8_MAX; + unsigned max = is_input_unsigned ? U8_MAX : S8_MAX; float scale = scale_to_one * max; // Create quantize output variable @@ -86,7 +87,8 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, q_desc.SetOutput("Output", std::vector({quantize_out_node->Name()})); q_desc.SetAttr("Scale", scale); - q_desc.SetAttr("is_negative_input", !is_unsigned); + q_desc.SetAttr("Shift", shift); + q_desc.SetAttr("is_negative_input", !is_input_unsigned); q_desc.SetAttr("output_format", Has("data_layout") ? 
Get("data_layout") : "NHWC"); @@ -103,11 +105,13 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, IR_NODE_LINK_TO(quantize_out_node, op); if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); + if (!shift_attr_name.empty()) op->Op()->SetAttr(shift_attr_name, shift); } void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, - bool are_unsigned, - std::string scale_attr_name) const { + bool are_inputs_unsigned, + std::string scale_attr_name, float shift, + std::string shift_attr_name) const { auto inputs = op->inputs; auto output = op->outputs[0]; PADDLE_ENFORCE_GE(inputs.size(), 1, @@ -127,7 +131,7 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, std::vector quantize_out_node_names(inputs.size()); double scale_out = GetScaleValueForNode(output); - unsigned max = are_unsigned ? U8_MAX : S8_MAX; + unsigned max = are_inputs_unsigned ? U8_MAX : S8_MAX; float scale = scale_out * max; for (size_t i = 0; i < inputs.size(); i++) { @@ -137,10 +141,11 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, quantize_out_node_names[i] = quantize_out_nodes[i]->Name(); q_desc.SetAttr("Scale", scale); + q_desc.SetAttr("Shift", shift); q_desc.SetInput("Input", std::vector({inputs[i]->Name()})); q_desc.SetOutput("Output", std::vector({quantize_out_node_names[i]})); - q_desc.SetAttr("is_negative_input", !are_unsigned); + q_desc.SetAttr("is_negative_input", !are_inputs_unsigned); auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. 
// link quantize op @@ -154,6 +159,7 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, op->Op()->SetInput(input_name, quantize_out_node_names); if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); + if (!shift_attr_name.empty()) op->Op()->SetAttr(shift_attr_name, shift); } void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output, @@ -782,6 +788,62 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { quantize_elementwise_add_count); } +void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { + GraphPatternDetector gpd; + patterns::FusionGru pattern{gpd.mutable_pattern(), name_scope_}; + pattern(); + + int quantize_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize fusion_gru op"; + GET_IR_NODE_FROM_SUBGRAPH(op, op, pattern); + + // skip if should not be quantized + if (!platform::HasOpINT8DataType(op->Op())) { + LogQuantizationDisabled(op); + return; + } + + GET_IR_NODE_FROM_SUBGRAPH(x, x, pattern); + GET_IR_NODE_FROM_SUBGRAPH(weight_h, weight_h, pattern); + GET_IR_NODE_FROM_SUBGRAPH(weight_x, weight_x, pattern); + GET_IR_NODE_FROM_SUBGRAPH(out, out, pattern); + + if (!AreScalesPresentForNodes(op, {x, weight_h, weight_x})) { + LogCannotQuantizeOp(op); + return; + } + + bool is_x_unsigned{false}; + auto input_x_scale = GetScaleValueForNode(x, &is_x_unsigned); + + double input_x_shift{128.}; + if (is_x_unsigned) input_x_shift = 0.; + + QuantizeInput(g, op, x, "X", input_x_scale, is_x_unsigned, "Scale_data", + input_x_shift, "Shift_data"); + + auto weight_scale_tensor = GetScaleTensorForNode(weight_x); + EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data(), + weight_scale_tensor.numel(), 1}; + eigen_tensor *= static_cast(S8_MAX); + std::vector scale_weights{ + weight_scale_tensor.data(), + weight_scale_tensor.data() + weight_scale_tensor.numel()}; + + op->Op()->SetAttr("Scale_weights", scale_weights); + // 
return fp32 data + op->Op()->SetAttr("force_fp32_output", true); + + ++quantize_count; + }; + gpd(graph, handler); + AddStatis(quantize_count); + + PrettyLogDetail("--- quantized %d fusion_gru ops", quantize_count); +} + void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Quantizing the graph."; PADDLE_ENFORCE_NOT_NULL( @@ -801,6 +863,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeReshape(graph); QuantizeMatmul(graph); QuantizeElementwiseAdd(graph); + QuantizeFusionGru(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index bd87b31b781ec..0d4c424901081 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -49,31 +49,26 @@ class CPUQuantizePass : public FusePassBase { void ApplyImpl(ir::Graph* graph) const override; void QuantizeConv(Graph* graph, bool with_residual_data = false) const; - void QuantizeFc(Graph* graph) const; - void QuantizePool(Graph* graph) const; - void QuantizeConcat(Graph* graph) const; - void QuantizePriorBox(Graph* graph) const; - void QuantizeTranspose(Graph* graph) const; - void QuantizeReshape(Graph* graph) const; - void QuantizeMatmul(Graph* graph) const; - void QuantizeElementwiseAdd(Graph* graph) const; + void QuantizeFusionGru(Graph* graph) const; void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, - double scale_to_one, bool is_unsigned, - std::string scale_attr_name = "") const; + double scale_to_one, bool is_input_unsigned, + std::string scale_attr_name = "", float shift = 0.0, + std::string shift_attr_name = "") const; // quantize all inputs of given name with the same (minimum) scale void QuantizeInputs(Graph* g, Node* op, std::string input_name, - bool are_unsigned, - std::string scale_attr_name = "") const; + bool are_inputs_unsigned, + std::string scale_attr_name = "", float shift = 0.0, + 
std::string shift_attr_name = "") const; void DequantizeOutput(Graph* g, Node* op, Node* output, std::string output_name, double scale_to_one, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index a66e9f0e93898..65be404dfef2f 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -91,6 +91,16 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); + } else if (type == "fusion_gru") { + op->SetInput("X", {inputs[0]}); + op->SetInput("WeightX", {inputs[1]}); + op->SetInput("WeightH", {inputs[2]}); + op->SetInput("Bias", {inputs[3]}); + op->SetOutput("Hidden", {outputs[0]}); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); + op->SetAttr("Scale_data", 1.0f); + op->SetAttr("Shift_data", 0.0f); + op->SetAttr("Scale_weights", std::vector<float>{1.0f}); } } @@ -389,6 +399,77 @@ TEST(CpuQuantizePass, transpose) { quant_count, dequant_count, added_nodes_count, 2.0f * 127); } +static const std::initializer_list<std::string> variable_names_fusion_gru = { + "x", "wx", "wh", "b", "h"}; + +// x->Fusion_gru->h +ProgramDesc BuildProgramDescFusionGru() { + ProgramDesc prog; + for (auto& v : variable_names_fusion_gru) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v.find("wx") == 0 || v.find("wh") == 0 || v.find("b") == 0) { + var->SetPersistable(true); + } + } + + SetOp(&prog, "fusion_gru", "Fusion_gru", {"x", "wx", "wh", "b"}, {"h"}, true, + "int8"); + + return prog; +} + +void MainTestFusionGru(const ProgramDesc& prog, int gru_count, int quant_count, + int dequant_count, int added_nodes_count, float scale, + float shift) { + std::unique_ptr<ir::Graph> graph(new ir::Graph(prog)); + int original_nodes_num, current_nodes_num; + PreparePass(&graph, prog, variable_names_fusion_gru, &original_nodes_num, + &current_nodes_num); + + int 
quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int gru_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "fusion_gru") { + gru_nodes_count++; + + auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name")); + EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_data")), scale) + << "Scale_data for node '" + op_name + "'."; + EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Shift_data")), shift) + << "Shift_data for node '" + op_name + "'."; + EXPECT_EQ(BOOST_GET_CONST(std::vector, + op->GetAttr("Scale_weights"))[0], + scale) + << "Scale_weights for node '" + op_name + "'."; + EXPECT_EQ(BOOST_GET_CONST(bool, op->GetAttr("force_fp32_output")), true) + << "force_fp32_output for node '" + op_name + "'."; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + } + } + } + EXPECT_EQ(gru_nodes_count, gru_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, fusion_gru) { + // x->Fusion_gru->h + int gru_count = 1; + int quant_count = 1; + int dequant_count = 0; + // 1 Quant + 1 IN + 0 DeQuant + 0 OUT + int added_nodes_count = 1 + 1 + 0 + 0; + MainTestFusionGru(BuildProgramDescFusionGru(), gru_count, quant_count, + dequant_count, added_nodes_count, 2. 
* 127, 128.); +} + static const std::initializer_list variable_names_reshape = { "a", "w1", "b", "c", "d", "e", "f"}; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 54ab244a99bd4..d6146f264ab8d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -76,6 +76,8 @@ void CPUQuantizeSquashPass::DequantQuantSquash( BOOST_GET_CONST(float, dequant_op->Op()->GetAttr("Scale")); float quant_scale = BOOST_GET_CONST(float, quant_op->Op()->GetAttr("Scale")); + float dequant_shift = dequant_op->Op()->GetAttrIfExists("Shift"); + float quant_shift = quant_op->Op()->GetAttrIfExists("Shift"); PADDLE_ENFORCE_NE( nodes_keep_counter->find(dequant_out), nodes_keep_counter->end(), platform::errors::NotFound("The dequant output node is not found.")); @@ -83,7 +85,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash( // check if dequantize op should be kept or removed, decrease the counter bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; - if (dequant_scale == quant_scale) { + if (dequant_scale == quant_scale && dequant_shift == quant_shift) { // squash dequantize-quantize to nothing auto quant_out_var_name = quant_out->Name(); auto next_op_inputs = next_op_desc->InputNames(); @@ -110,7 +112,9 @@ void CPUQuantizeSquashPass::DequantQuantSquash( desc.SetInput("Input", std::vector({dequant_in->Name()})); desc.SetOutput("Output", std::vector({quant_out->Name()})); desc.SetAttr("Scale_in", dequant_scale); + desc.SetAttr("Shift_in", dequant_shift); desc.SetAttr("Scale_out", quant_scale); + desc.SetAttr("Shift_out", quant_shift); auto requant_op = g->CreateOpNode(&desc); @@ -293,6 +297,7 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const { })); auto* first_quant_out = first_quant_op->outputs[0]; float scale = first_quant_op->Op()->GetAttrIfExists("Scale"); + float shift = 
first_quant_op->Op()->GetAttrIfExists("Shift"); PADDLE_ENFORCE_NE(scale, 0, platform::errors::InvalidArgument( @@ -302,7 +307,8 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const { auto quant_op = prev_out->outputs[iter]; if (quant_op->IsOp() && quant_op->Op()->Type() == "quantize" && quant_op->id() != first_quant_op->id() && - quant_op->Op()->GetAttrIfExists("Scale") == scale) { + quant_op->Op()->GetAttrIfExists("Scale") == scale && + quant_op->Op()->GetAttrIfExists("Shift") == shift) { auto quant_out = quant_op->outputs[0]; auto last_op = quant_out->outputs[0]; diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc index 826e14dedb76d..93826fc97b196 100644 --- a/paddle/fluid/framework/op_compatible_info.cc +++ b/paddle/fluid/framework/op_compatible_info.cc @@ -182,40 +182,5 @@ OpCompatibleType OpCompatibleMap::IsRequireMiniVersion( } } -bool OpCompatibleMap::ConvertToProto(proto::OpCompatibleMap* desc) const { - desc->Clear(); - desc->set_default_required_version(default_required_version_); - for (auto pair : op_compatible_map_) { - const CompatibleInfo& info = pair.second; - auto* pair_desc = desc->add_pair(); - pair_desc->set_op_name(pair.first); - auto* info_desc = pair_desc->mutable_compatible_info(); - info_desc->set_version(info.required_version_); - info_desc->set_type( - static_cast(info.compatible_type_)); - } - return true; -} - -bool OpCompatibleMap::ReadFromProto(const proto::OpCompatibleMap& desc) { - std::string version = desc.default_required_version(); - if (version.empty()) { - LOG(INFO) << "The default operator required version is missing." 
- " Please update the model version."; - return false; - } - op_compatible_map_.clear(); - default_required_version_ = desc.default_required_version(); - for (int i = 0; i < desc.pair_size(); ++i) { - const auto& pair_desc = desc.pair(i); - auto info_desc = pair_desc.compatible_info(); - CompatibleInfo info(info_desc.version(), - static_cast(info_desc.type())); - std::pair pair(pair_desc.op_name(), info); - op_compatible_map_.insert(pair); - } - return true; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_compatible_info.h b/paddle/fluid/framework/op_compatible_info.h index 01fbdef99cbbc..6f86b8b64ed21 100644 --- a/paddle/fluid/framework/op_compatible_info.h +++ b/paddle/fluid/framework/op_compatible_info.h @@ -58,14 +58,6 @@ class OpCompatibleMap { OpCompatibleType IsRequireMiniVersion(std::string op_name, std::string current_version) const; - // Convert the entire OpCompatibleMap to Proto, which can be serialized - // to the model file as part of the ProgramDesc. - bool ConvertToProto(proto::OpCompatibleMap* desc) const; - - // Read and reset the entire object from proto, which can be read from - // the model file as part of the program. - bool ReadFromProto(const proto::OpCompatibleMap& desc); - const std::string& GetDefaultRequiredVersion() const { return default_required_version_; } diff --git a/paddle/fluid/framework/op_compatible_info_test.cc b/paddle/fluid/framework/op_compatible_info_test.cc index 98f3f5071ad28..cf210ed8ab2d5 100644 --- a/paddle/fluid/framework/op_compatible_info_test.cc +++ b/paddle/fluid/framework/op_compatible_info_test.cc @@ -28,12 +28,6 @@ TEST(test_op_compatible_info, test_op_compatible) { auto comp_map = OpCompatibleMap(); comp_map.InitOpCompatibleMap(); - // Ensure save-load consistency. 
- auto program_desc = ProgramDesc(); - proto::OpCompatibleMap* proto_map = program_desc.OpCompatibleMap(); - comp_map.ConvertToProto(proto_map); - comp_map.ReadFromProto(*proto_map); - ASSERT_NE(comp_map.GetDefaultRequiredVersion(), std::string()); ASSERT_NE(comp_map.GetOpCompatibleInfo("sequence_pad").required_version_, std::string()); diff --git a/paddle/fluid/framework/op_version_proto.cc b/paddle/fluid/framework/op_version_proto.cc new file mode 100644 index 0000000000000..696e322380740 --- /dev/null +++ b/paddle/fluid/framework/op_version_proto.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_version_proto.h" diff --git a/paddle/fluid/framework/op_version_proto.h b/paddle/fluid/framework/op_version_proto.h new file mode 100644 index 0000000000000..1a876f43d2f00 --- /dev/null +++ b/paddle/fluid/framework/op_version_proto.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace framework { +namespace compatible { +namespace pb { + +class OpVersion { + public: + explicit OpVersion(proto::OpVersion* desc) : desc_{desc} {} + void SetVersionID(uint32_t version) { desc_->set_version(version); } + + private: + proto::OpVersion* desc_; +}; + +class OpVersionMap { + public: + explicit OpVersionMap(proto::OpVersionMap* desc) : desc_{desc} {} + OpVersion operator[](const std::string& key) { + for (int i = 0; i < desc_->pair_size(); ++i) { + if (desc_->pair(i).op_name() == key) { + return OpVersion(desc_->mutable_pair(i)->mutable_op_version()); + } + } + auto* pair = desc_->add_pair(); + pair->set_op_name(key); + return OpVersion(pair->mutable_op_version()); + } + + private: + proto::OpVersionMap* desc_; +}; + +} // namespace pb +} // namespace compatible +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_version_registry.cc b/paddle/fluid/framework/op_version_registry.cc index 11b7224e68340..9a67c160f0233 100644 --- a/paddle/fluid/framework/op_version_registry.cc +++ b/paddle/fluid/framework/op_version_registry.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index fea043a0ff311..5ddaf1bd8d8ce 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -159,12 +160,14 @@ class OpVersionRegistrar { op_version_map_.insert({op_type, OpVersion()}); return op_version_map_[op_type]; } + const std::unordered_map& GetVersionMap() { + return op_version_map_; + } uint32_t GetVersionID(const std::string& op_type) const { auto it = op_version_map_.find(op_type); if (it == op_version_map_.end()) { return 0; } - return it->second.GetVersionID(); } @@ -175,6 +178,14 @@ class OpVersionRegistrar { OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete; }; +inline void SaveOpVersions( + const std::unordered_map& src, + pb::OpVersionMap* dst) { + for (const auto& pair : src) { + (*dst)[pair.first].SetVersionID(pair.second.GetVersionID()); + } +} + class OpVersionComparator { public: virtual bool operator()() = 0; diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index d6b18751cefe5..2b173c9571588 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index d37a16a3e7d9f..0faa870f50565 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -39,8 +39,8 @@ proto::ProgramDesc *ProgramDesc::Proto() { return &desc_; } -proto::OpCompatibleMap *ProgramDesc::OpCompatibleMap() { - return desc_.mutable_op_compatible_map(); +proto::OpVersionMap *ProgramDesc::OpVersionMap() { + return desc_.mutable_op_version_map(); } int64_t ProgramDesc::Version() const { return desc_.version().version(); } diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 5cafc9111da67..8b1aac95fc288 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -58,7 +58,7 @@ class ProgramDesc { proto::ProgramDesc *Proto(); - proto::OpCompatibleMap *OpCompatibleMap(); + proto::OpVersionMap *OpVersionMap(); int64_t Version() const; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index a073dbd733f0b..4fe01aff79e52 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -38,6 +38,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, dst->Resize(src.dims()); dst->set_layout(src.layout()); +#ifdef PADDLE_WITH_MKLDNN + dst->set_format(src.format()); +#endif auto src_place = src.place(); auto src_ptr = src.data(); auto dst_ptr = dst->mutable_data(dst_place, src.type()); @@ -237,6 +240,9 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, src.check_memory_size(); dst->Resize(src.dims()); dst->set_layout(src.layout()); +#ifdef PADDLE_WITH_MKLDNN + dst->set_format(src.format()); +#endif auto src_place = src.place(); auto src_ptr = src.data(); auto dst_ptr = dst->mutable_data(dst_place, src.type()); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index f85e1f6511656..6d35d3395ba60 
100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -88,7 +88,7 @@ if(NOT APPLE AND NOT WIN32) set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") # check symbol hidden FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake - "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" + "execute_process(COMMAND sh -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n" "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" " message(FATAL_ERROR \"Check symbol failed.\")\n" diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index e78d5ef017b7f..2c454893a6203 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -252,7 +252,11 @@ void LiteSubgraphPass::SetUpEngine( } else if (use_xpu) { target_type = TARGET(kXPU); } else { +#ifdef PADDLE_WITH_ARM + target_type = TARGET(kARM); +#else target_type = TARGET(kX86); +#endif } paddle::lite_api::PrecisionType precision_type = diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6c68b385bcbc0..98bee2d4bb471 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -192,11 +192,6 @@ bool AnalysisPredictor::PrepareProgram( // If config_.ir_optim() is False, parameters is loaded in LoadParameters(), // still need to create other persistable variables. // So in both case, create persistable variables at first. - if (!CheckOperatorCompatible()) { - LOG(WARNING) << "WARNING: Results may be DIFF! 
" - "Please use the corresponding version of the model and " - "prediction library, and do not use the develop branch."; - } executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); // if enable_ir_optim_ is false, @@ -998,40 +993,6 @@ std::string AnalysisPredictor::GetSerializedProgram() const { return inference_program_->Proto()->SerializeAsString(); } -bool AnalysisPredictor::CheckOperatorCompatible() { - if (!inference_program_) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Inference program version check failed because the program does not " - "exist.")); - return false; - } - bool res = true; - op_compatible_map_.ReadFromProto(*inference_program_->OpCompatibleMap()); - const auto &version = framework::DumpVersion(framework::kCurProgramVersion); - LOG(INFO) << "MODEL VERSION: " - << framework::DumpVersion(inference_program_->Version()); - LOG(INFO) << "PREDICTOR VERSION: " << version; - std::set op_types; - for (size_t i = 0; i < inference_program_->Size(); ++i) { - const auto &block = inference_program_->Block(i); - for (const auto *op : block.AllOps()) { - op_types.insert(op->Type()); - } - } - for (const auto type : op_types) { - auto compatible_type = - op_compatible_map_.IsRequireMiniVersion(type, version); - if (compatible_type != framework::OpCompatibleType::compatible) { - if (!framework::kCurProgramVersion) { - LOG(WARNING) << " - Version incompatible (" - << static_cast(compatible_type) << ") " << type; - } - res = false; - } - } - return res; -} - // Add SaveOptimModel void AnalysisPredictor::SaveOptimModel(const std::string &dir) { // save model diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index c4a7173b0104b..269f2fd80bb47 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -335,13 +335,6 @@ class AnalysisPredictor : public PaddlePredictor { /// AnalysisPredictor::ZeroCopyRun() now. 
/// void MkldnnPostReset(); - /// - /// \brief Compute compatibility based on model version information and - /// operator version information - /// - /// \return Compatible information - /// - bool CheckOperatorCompatible(); #if PADDLE_WITH_TENSORRT /// diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh index b6b7d1f20baf7..a0f64796576c8 100755 --- a/paddle/fluid/inference/check_symbol.sh +++ b/paddle/fluid/inference/check_symbol.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh lib=$1 if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 33661594b926f..7b909b3f84205 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -46,6 +46,7 @@ platform::Place GetNativePlace(const TargetType& type, int id = 0) { switch (type) { case TargetType::kHost: case TargetType::kX86: + case TargetType::kARM: return platform::CPUPlace(); case TargetType::kCUDA: return platform::CUDAPlace(id); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index ac0a04b9a116d..4a386ac1d81c5 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -164,6 +164,7 @@ class OpConverter { const std::unordered_set& parameters, const std::vector& outputs, TensorRTEngine* engine) { engine->InitNetwork(); + bool all_dynamic_shape_set = true; for (auto& input : inputs) { if (parameters.count(input)) continue; auto* var = block_desc->FindVar(input); @@ -181,6 +182,13 @@ class OpConverter { auto max_input_shape = engine->max_input_shape()[input]; auto optim_input_shape = engine->optim_input_shape()[input]; size_t ranks = min_input_shape.size(); + if (ranks == 0) { + all_dynamic_shape_set = false; + LOG(INFO) << "trt input [" << input.c_str() + << "] dynamic shape 
info not set, please check and retry."; + // check other input + continue; + } std::vector input_shape; input_shape.push_back(-1); for (size_t i = 1; i < ranks; i++) { @@ -207,6 +215,10 @@ class OpConverter { Vec2TRT_Dims(var_shape, input)); } } + PADDLE_ENFORCE_EQ(all_dynamic_shape_set, true, + platform::errors::InvalidArgument( + "some trt inputs dynamic shape info not set, " + "check the INFO log above for more details.")); framework::proto::BlockDesc* block_proto = block_desc->Proto(); ConvertBlock(*block_proto, parameters, scope, engine); for (auto& output : outputs) { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 723e989be8de8..252bca2d5522e 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -65,6 +65,7 @@ DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); DEFINE_bool(warmup, false, "Use warmup to calculate elapsed_time more accurately. 
" "To reduce CI time, it sets false in default."); +DEFINE_int32(warmup_iters, 1, "Number of batches to process during warmup."); DEFINE_bool(enable_profile, false, "Turn on profiler for fluid"); DEFINE_int32(cpu_num_threads, 1, "Number of threads for each paddle instance."); @@ -364,15 +365,28 @@ void PredictionWarmUp(PaddlePredictor *predictor, if (FLAGS_zero_copy) { ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]); } - outputs->resize(1); + int iterations = 1; + if (FLAGS_warmup_iters > 1) + iterations = std::min(FLAGS_warmup_iters, static_cast(inputs.size())); + outputs->resize(iterations); Timer warmup_timer; - warmup_timer.tic(); + double elapsed_time = 0; if (!FLAGS_zero_copy) { - predictor->Run(inputs[0], &(*outputs)[0], batch_size); + for (int i = 0; i < iterations; ++i) { + warmup_timer.tic(); + predictor->Run(inputs[i], &(*outputs)[i], batch_size); + elapsed_time += warmup_timer.toc(); + } } else { - predictor->ZeroCopyRun(); + for (int i = 0; i < iterations; ++i) { + warmup_timer.tic(); + predictor->ZeroCopyRun(); + elapsed_time += warmup_timer.toc(); + } } - PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1, data_type); + auto batch_latency = elapsed_time / iterations; + PrintTime(batch_size, 1, num_threads, tid, batch_latency, iterations, + data_type); if (FLAGS_enable_profile) { paddle::platform::ResetProfiler(); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 53e6f4aa6e41b..5fa8f6bab8cca 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -144,4 +144,5 @@ cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_o if(WITH_MKLDNN) include(mkldnn/inplace_op_tests.cmake) +include(mkldnn/nhwc_op_tests.cmake) endif() diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 1903b9e30d800..26ad09cc265f1 100644 --- 
a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -41,7 +41,7 @@ struct CudnnActivationFunctor { TensorDescriptor x_desc, out_desc; x_desc.set(x); out_desc.set(GET_DATA_SAFELY(out, "Output", "Out", "CudnnActivation")); - PADDLE_ENFORCE(platform::dynload::cudnnActivationForward( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationForward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), out_desc.desc(), @@ -67,7 +67,7 @@ struct CudnnActivationGradFunctor { out_desc.set(out); dout_desc.set(dout); dx_desc.set(GET_DATA_SAFELY(dx, "Output", "X@GRAD", "CudnnActivationGrad")); - PADDLE_ENFORCE(platform::dynload::cudnnActivationBackward( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationBackward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), out_desc.desc(), out.data(), dout_desc.desc(), dout.data(), x_desc.desc(), x.data(), diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index fca3c531b4055..8bd76a9886c62 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -103,7 +103,7 @@ class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](float decr_ratio) { PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true, platform::errors::InvalidArgument( - "'incr_ratio' should be between 0 and 1, but " + "'decr_ratio' should be between 0 and 1, but " "the received is %f", decr_ratio)); }); diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 3cb3f1d48bfa7..4bf4ba1120df0 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -117,7 +117,8 @@ void BeamSearchDecodeFunctor::apply() const 
{ template <> void BeamSearchDecodeFunctor::apply() const { - PADDLE_THROW("beam search decode op does not support bool!"); + PADDLE_THROW(platform::errors::InvalidArgument( + "beam search decode op does not support bool!")); } class BeamSearchDecodeOp : public framework::OperatorBase { diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc index b742b4c0deea8..b00604155d67e 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cc +++ b/paddle/fluid/operators/bilateral_slice_op.cc @@ -50,20 +50,25 @@ class BilateralSliceOp : public framework::OperatorWithKernel { int64_t input_chans = input_dims[1]; int64_t output_chans; - if (has_offset) { - PADDLE_ENFORCE_EQ((coeffs_chans % (input_chans + 1)), 0, - platform::errors::InvalidArgument( - "Slicing with affine offset, coefficients grid " - "should have n_out*(n_in+1) channels, but got %d", - coeffs_chans)); - output_chans = coeffs_chans / (input_chans + 1); + if ((!ctx->IsRuntime()) && ((coeffs_chans < 0) || (input_chans < 0))) { + output_chans = -1; } else { - PADDLE_ENFORCE_EQ((coeffs_chans % input_chans), 0, - platform::errors::InvalidArgument( - "Slicing without affine offset, coefficients grid " - "should have n_out*n_in channels, but got %d .", - coeffs_chans)); - output_chans = coeffs_chans / input_chans; + if (has_offset) { + PADDLE_ENFORCE_EQ((coeffs_chans % (input_chans + 1)), 0, + platform::errors::InvalidArgument( + "Slicing with affine offset, coefficients grid " + "should have n_out*(n_in+1) channels, but got %d", + coeffs_chans)); + output_chans = coeffs_chans / (input_chans + 1); + } else { + PADDLE_ENFORCE_EQ( + (coeffs_chans % input_chans), 0, + platform::errors::InvalidArgument( + "Slicing without affine offset, coefficients grid " + "should have n_out*n_in channels, but got %d .", + coeffs_chans)); + output_chans = coeffs_chans / input_chans; + } } std::vector output_dims; diff --git a/paddle/fluid/operators/chunk_eval_op.h 
b/paddle/fluid/operators/chunk_eval_op.h index bee3ab37448e8..555130fe85268 100644 --- a/paddle/fluid/operators/chunk_eval_op.h +++ b/paddle/fluid/operators/chunk_eval_op.h @@ -146,7 +146,7 @@ class ChunkEvalKernel : public framework::OpKernel { tag_end = -1; tag_single = -1; } else { - PADDLE_THROW("Unknown chunk scheme."); + PADDLE_THROW(platform::errors::InvalidArgument("Unknown chunk scheme.")); } other_chunk_type = num_chunk_types = context.Attr("num_chunk_types"); excluded_chunk_types.insert( diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index b85e740ada9bd..b8ecbe8ab4a9f 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -70,6 +72,23 @@ class WhileOp : public framework::OperatorBase { auto *block = Attr(kStepBlock); auto *program = block->Program(); + bool is_test = Attr("is_test"); + + std::set no_copy_var_names; + if (!is_test) { + const std::vector &all_ops = block->AllOps(); + for (const framework::OpDesc *op : all_ops) { + const framework::VariableNameMap &input_var_names = op->Inputs(); + const framework::VariableNameMap &output_var_names = op->Outputs(); + for (auto &ipt : input_var_names) { + for (const std::string &var_name : ipt.second) { + if (StrInVaraiableNameMap(var_name, output_var_names)) { + no_copy_var_names.insert(var_name); + } + } + } + } + } auto step_scopes = scope.FindVar(Output(kStepScopes))->GetMutable(); @@ -89,7 +108,6 @@ class WhileOp : public framework::OperatorBase { "The Output(StepScope) of WhileOp should be empty.")); bool cond_data = GetCondData(cond); - bool is_test = Attr("is_test"); auto &skip_vars = 
Attr>(kSkipEagerDeletionVars); VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); @@ -98,8 +116,32 @@ class WhileOp : public framework::OperatorBase { while (cond_data) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); + + std::vector rename_vars; + for (const std::string &input_var_name : Inputs(kX)) { + if (no_copy_var_names.find(input_var_name) == + no_copy_var_names.end()) { + std::string input_var_rename = input_var_name + kSuffix; + framework::Variable *input_var = scope.FindVar(input_var_name); + if (input_var->IsType()) { + rename_vars.push_back(input_var_rename); + auto input_var_tensor = input_var->Get(); + auto *rename_input_var_tensor = + current_scope.Var(input_var_rename)->GetMutable(); + framework::TensorCopy(input_var_tensor, dev_place, + rename_input_var_tensor); + rename_input_var_tensor->set_lod(input_var_tensor.lod()); + } + } + } executor.RunPreparedContext(ctx.get(), ¤t_scope, false, true, true); + + for (auto &var_rename : rename_vars) { + std::string input_var_name = + var_rename.substr(0, var_rename.size() - strlen(kSuffix)); + current_scope.Rename(var_rename, input_var_name); + } cond_data = GetCondData(scope.FindVar(Input(kCondition))->Get()); } @@ -312,6 +354,10 @@ class WhileGradOp : public framework::OperatorBase { // continue; // } + auto var_iter = + std::find(outside_og_names.begin(), outside_og_names.end(), + pg_ig_names[param_id]); + // zero gradient variable in step 0 if (cur_scope_iter == step_scopes->rbegin()) { auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); @@ -326,7 +372,8 @@ class WhileGradOp : public framework::OperatorBase { "or LoDTensor, but the received var[%s] is %s.", inside_grad_name, framework::ToTypeName(var->Type()))); - if (var->IsType()) { + if ((var_iter == outside_og_names.end()) && + var->IsType()) { auto &inside_tensor = var->Get(); framework::AttributeMap attrs; attrs["dtype"] = inside_tensor.type(); @@ -343,13 +390,18 @@ class WhileGradOp : public 
framework::OperatorBase { ->set_lod(inside_tensor.lod()); } } - auto new_inside_name = cur_scope.Rename(inside_grad_name); - auto sum_op = framework::OpRegistry::CreateOp( - "sum", {{"X", {pg_ig_names[param_id], new_inside_name}}}, - {{"Out", {pg_ig_names[param_id]}}}, - framework::AttributeMap{{"use_mkldnn", {false}}}); - sum_op->Run(cur_scope, dev_place); - cur_scope.Rename(new_inside_name, inside_grad_name); + auto var_outside = scope.FindVar(pg_ig_names[param_id]); + if ((var_iter == outside_og_names.end()) || + ((var_iter != outside_og_names.end()) && + var_outside->IsType())) { + auto new_inside_name = cur_scope.Rename(inside_grad_name); + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {pg_ig_names[param_id], new_inside_name}}}, + {{"Out", {pg_ig_names[param_id]}}}, + framework::AttributeMap{{"use_mkldnn", {false}}}); + sum_op->Run(cur_scope, dev_place); + cur_scope.Rename(new_inside_name, inside_grad_name); + } } dev_ctx.Wait(); const_cast(scope).DeleteScope(&cur_scope); diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index a3fe71f3ec8b3..b8e9f9f36ac81 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -232,5 +232,16 @@ bool GetCondData(const framework::LoDTensor &cond) { return cpu_cond->data()[0]; } +bool StrInVaraiableNameMap(const std::string &name, + const framework::VariableNameMap &var_names) { + for (auto &ipt : var_names) { + if (std::find(ipt.second.begin(), ipt.second.end(), name) != + ipt.second.end()) { + return true; + } + } + return false; +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index d2e9953e6477a..8b4a14570b1ef 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -38,6 
+38,7 @@ static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; +static constexpr char kSuffix[] = "@TMP_COPY"; void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( const framework::ProgramDesc &program, int block_id, @@ -50,5 +51,8 @@ void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( bool GetCondData(const framework::LoDTensor &cond); +bool StrInVaraiableNameMap(const std::string &, + const framework::VariableNameMap &); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index 82954bc109a74..31f0c26a3f3a1 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -25,7 +26,6 @@ class CudnnLSTMOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTM"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTM"); OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTM"); OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTM"); @@ -122,7 +122,13 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("W", "(Tensor) the learnable hidden-hidden weights." " The shape is (N), where N is total weight size of the LSTM. " - " cudnn concatenate all the weight to one Tensor"); + " cudnn concatenate all the weight to one Tensor") + .AsDispensable(); + AddInput("WeightList", + "(vector), stores weight and bias data when the weight " + "use the list format. 
") + .AsDispensable() + .AsDuplicable(); AddInput("SequenceLength", "(Tensor) When the input data is padding, " "set this parameter. This parameter represents " @@ -216,7 +222,6 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTMGrad"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTMGrad"); OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTMGrad"); OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTMGrad"); @@ -228,7 +233,10 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { }; SetOutGradDim("Input"); - SetOutGradDim("W"); + if (ctx->HasInputs("WeightList")) { + ctx->SetOutputsDim(framework::GradVarName("WeightList"), + ctx->GetInputsDim("WeightList")); + } SetOutGradDim("InitH"); SetOutGradDim("InitC"); } @@ -251,7 +259,9 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("Input", this->Input("Input")); op->SetInput("InitH", this->Input("InitH")); op->SetInput("InitC", this->Input("InitC")); - op->SetInput("W", this->Input("W")); + if (this->HasInput("WeightList")) { + op->SetInput("WeightList", this->Input("WeightList")); + } if (this->HasInput("SequenceLength")) { op->SetInput("SequenceLength", this->Input("SequenceLength")); } @@ -262,8 +272,12 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker { op->SetInput(framework::GradVarName("LastC"), this->OutputGrad("LastC")); op->SetInput(framework::GradVarName("LastH"), this->OutputGrad("LastH")); + if (this->HasInput("WeightList")) { + op->SetOutput(framework::GradVarName("WeightList"), + this->InputGrad("WeightList", false)); + } + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - op->SetOutput(framework::GradVarName("W"), this->InputGrad("W")); op->SetOutput(framework::GradVarName("InitH"), this->InputGrad("InitH")); 
op->SetOutput(framework::GradVarName("InitC"), this->InputGrad("InitC")); op->SetAttrMap(this->Attrs()); @@ -274,8 +288,8 @@ template class NotImpleKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW( - "CPU is not support for this kernel now. Will be add in the future"); + PADDLE_THROW(platform::errors::Unimplemented( + "CPU is not support for this kernel now. Will be add in the future")); } }; @@ -290,3 +304,20 @@ REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel); REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel); + +// TODO(Shixiaowei02) Add ModifyInput support +REGISTER_OP_VERSION(cudnn_lstm) + .AddCheckpoint( + R"ROC( + Upgrade cudnn_lstm add a new input [WeightList] and modify input [W] to dispensable.)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewInput( + "WeightList", + "The WeightList stores weight and bias data. WeightList is " + "dispensable.") + .NewInput("SequenceLength", + "When the input data is padding, set this parameter. 
" + "SequenceLength is dispensable.") + .NewOutput("StateOut", "Store the global drop state when training") + .NewOutput("Reserve", + "A temporary output Tensor to store the reserve_data")); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 6ac75b78d7058..bea7d9c02ca7d 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -30,6 +30,66 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; +template +bool is_continuous(const Type &weight_list) { + bool continuous = true; + for (size_t i = 0; i < weight_list.size() - 1; ++i) { + auto *in_data = weight_list[i]->template data(); + auto *in_after_data = weight_list[i + 1]->template data(); + auto in_size = weight_list[i]->numel(); + bool temp = in_data + in_size == in_after_data; + continuous = continuous && temp; + } + return continuous; +} + +int size_sum(const std::vector &weight_list) { + int size = 0; + for (size_t i = 0; i < weight_list.size(); ++i) { + auto in_size = weight_list[i]->numel(); + size += in_size; + } + return size; +} + +template +void weight_to_tensor(const platform::Place &place, cudaStream_t stream, + const std::vector &weight_list, + Tensor *weight) { + auto weight_data = weight->data(); + int weight_offset = 0; + for (size_t i = 0; i < weight_list.size(); ++i) { + const T *in_data = weight_list[i]->data(); + auto in_size = weight_list[i]->numel(); + + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight->place()), + weight_data + weight_offset, + BOOST_GET_CONST(platform::CUDAPlace, weight_list[i]->place()), + in_data, in_size * sizeof(T), stream); + weight_offset += in_size; + } +} + +template +void weight_to_tensor_list(const platform::Place &place, cudaStream_t stream, + std::vector *weight_grad, + const std::vector &weight_input, + const Tensor *weight) { + int weight_offset = 0; + auto *weight_data = weight->data(); + for (size_t i = 
0; i < weight_input.size(); ++i) { + auto in_size = weight_input[i]->numel(); + T *weight_grad_data = (*weight_grad)[i]->mutable_data(place); + const T *src = weight_data + weight_offset; + + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, (*weight_grad)[i]->place()), + weight_grad_data, BOOST_GET_CONST(platform::CUDAPlace, weight->place()), + src, in_size * sizeof(T), stream); + weight_offset += in_size; + } +} + template void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, const int &seq_length, ScopedRNNBase *rnn, const T *x_data, @@ -75,8 +135,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { const Tensor *init_h = ctx.Input("InitH"); const Tensor *init_c = ctx.Input("InitC"); - auto w = ctx.Input("W"); - Tensor *out = ctx.Output("Out"); Tensor *last_h = ctx.Output("LastH"); Tensor *last_c = ctx.Output("LastC"); @@ -87,8 +145,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { const T *init_h_data = init_h->data(); const T *init_c_data = init_c->data(); - const T *w_data = w->data(); - T *out_data = out->mutable_data(ctx.GetPlace()); T *last_h_data = last_h->mutable_data(ctx.GetPlace()); T *last_c_data = last_c->mutable_data(ctx.GetPlace()); @@ -113,11 +169,45 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int seq_length = x->dims()[0]; int batch_size = x->dims()[1]; int input_size = x->dims()[2]; - int weight_numel = w->numel(); bool state_initialized = state_out->IsInitialized() ? true : false; size_t workspace_size; size_t reserve_size; + Tensor weight_whole; + T *w_data = nullptr; + int weight_numel; + bool w_initialized = false; + auto place = ctx.GetPlace(); + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + if (is_test && ctx.HasInput("W")) { + auto *W = ctx.Input("W"); + w_initialized = W->IsInitialized() ? 
true : false; + weight_numel = W->numel(); + } + if (!w_initialized) { + auto weight_list = ctx.MultiInput("WeightList"); + bool continuous = + is_continuous>(weight_list); + weight_numel = size_sum(weight_list); + + if (!continuous) { + LOG_FIRST_N(WARNING, 2) + << "If the memory space of the Input WeightList is not " + "continuous, less efficient calculation will be " + "called. Please call coalesce_tensor op to make the " + "input memory continuous."; + weight_whole.mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, &weight_whole); + w_data = weight_whole.data(); + } else { + w_data = const_cast(weight_list[0]->data()); + } + } else { + auto *W = ctx.Input("W"); + w_data = const_cast(W->data()); + } ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size, num_layers, dropout_prob, seed, weight_numel, @@ -136,6 +226,12 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { LSTMInferece(has_seq_length, handle, seq_length, &rnn, x_data, init_h_data, init_c_data, w_data, out_data, last_h_data, last_c_data, &workspace_data_, workspace_size); + if (!w_initialized && ctx.HasInput("W") && ctx.HasInput("WeightList")) { + auto *W = const_cast(ctx.Input("W")); + auto weight_list = ctx.MultiInput("WeightList"); + W->mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, W); + } } else { if (!has_seq_length) { // for train @@ -176,11 +272,11 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto *input = ctx.Input("Input"); - auto *weight = ctx.Input("W"); auto *init_h = ctx.Input("InitH"); auto *init_c = ctx.Input("InitC"); auto *reserve = ctx.Input("Reserve"); auto *state_out = ctx.Input("StateOut"); + auto weight_list = ctx.MultiInput("WeightList"); auto *out = ctx.Input("Out"); auto *out_grad = ctx.Input(framework::GradVarName("Out")); @@ -188,9 +284,10 @@ class CudnnLSTMGPUGradKernel : public 
framework::OpKernel { auto *last_c_grad = ctx.Input(framework::GradVarName("LastC")); auto *in_grad = ctx.Output(framework::GradVarName("Input")); - auto *weight_grad = ctx.Output(framework::GradVarName("W")); auto *init_h_grad = ctx.Output(framework::GradVarName("InitH")); auto *init_c_grad = ctx.Output(framework::GradVarName("InitC")); + auto weight_grad_list = ctx.MultiOutput( + framework::GradVarName("WeightList")); auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); @@ -199,7 +296,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto init_h_dims = init_h->dims(); auto init_c_dims = init_c->dims(); - auto *weight_data = weight->data(); auto *init_h_data = init_h->data(); auto *init_c_data = init_c->data(); auto *out_data = out->data(); @@ -207,18 +303,50 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto *last_h_grad_data = last_h_grad->data(); auto *last_c_grad_data = last_c_grad->data(); + auto place = ctx.GetPlace(); + int weight_numel = size_sum(weight_list); + bool continuous = + is_continuous>(weight_list); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + Tensor weight_whole; + T *weight_data = nullptr; + + if (!continuous) { + weight_whole.mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, &weight_whole); + weight_data = weight_whole.data(); + } else { + weight_data = const_cast(weight_list[0]->data()); + } + + Tensor weight_grad; math::SetConstant zero; - weight_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, weight_grad, static_cast(0.0)); + weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); + zero(dev_ctx, &weight_grad, static_cast(0.0)); + T *weight_grad_data = weight_grad.data(); + + int offset = 0; + for (size_t i = 0; i < weight_grad_list.size(); ++i) { + size_t len = weight_grad_list[i]->numel(); + auto dim = weight_grad_list[i]->dims(); + weight_grad_list[i] + 
->ShareDataWith(weight_grad.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } in_grad->mutable_data(input_dims, ctx.GetPlace()); auto *in_grad_data = in_grad->data(); - init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); - auto *init_h_grad_data = init_h_grad->data(); + if (init_h_grad) init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); + auto *init_h_grad_data = init_h_grad ? init_h_grad->data() : nullptr; - init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); - auto *init_c_grad_data = init_c_grad->data(); + if (init_c_grad) init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); + auto *init_c_grad_data = init_c_grad ? init_c_grad->data() : nullptr; float dropout_prob = ctx.Attr("dropout_prob"); bool is_bidirec = ctx.Attr("is_bidirec"); @@ -236,7 +364,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { int seq_length = input_dims[0]; int batch_size = input->dims()[1]; int input_size = input->dims()[2]; - int weight_numel = weight->numel(); size_t workspace_size; size_t reserve_size; @@ -268,8 +395,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), workspace_data_.data(), workspace_size, rnn.weight_desc(), - weight_grad->data(), const_cast(reserve_data), - reserve_size)); + weight_grad_data, const_cast(reserve_data), reserve_size)); } else { #if CUDNN_VERSION >= 7201 // for train @@ -288,7 +414,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_seq_desc(), out->data(), workspace_data_.data(), workspace_size, - rnn.weight_desc(), weight_grad->data(), + rnn.weight_desc(), weight_grad_data, const_cast(reserve_data), reserve_size)); #else PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/operators/edit_distance_op.cu 
b/paddle/fluid/operators/edit_distance_op.cu index 8d79626aa8785..80490af33a1f9 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -111,8 +111,9 @@ class EditDistanceGPUKernel : public framework::OpKernel { if (normalized) { for (size_t i = 1; i < ref_lod.size(); ++i) { - PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], - "Reference string %d is empty.", i); + PADDLE_ENFORCE_GT(ref_lod[i], ref_lod[i - 1], + platform::errors::InvalidArgument( + "Reference string %d is empty.", i)); } } diff --git a/paddle/fluid/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h index 3e1aec7ceeec7..ef290c2eff2be 100644 --- a/paddle/fluid/operators/edit_distance_op.h +++ b/paddle/fluid/operators/edit_distance_op.h @@ -58,8 +58,9 @@ class EditDistanceKernel : public framework::OpKernel { if (normalized) { for (size_t i = 1; i < ref_lod.size(); ++i) { - PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], - "Reference string %d is empty.", i); + PADDLE_ENFORCE_GT(ref_lod[i], ref_lod[i - 1], + platform::errors::InvalidArgument( + "Reference string %d is empty.", i)); } } auto num_strs = hyp_lod.size() - 1; @@ -106,10 +107,11 @@ class EditDistanceKernel : public framework::OpKernel { } if (normalized) { - PADDLE_ENFORCE(n > 0, - "The reference string (#%d) cannot be empty " - "when Attr(normalized) is enabled.", - n); + PADDLE_ENFORCE_GT(n, 0UL, + platform::errors::InvalidArgument( + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n)); distance = distance / n; } out[num] = distance; diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 870464efed2b1..25b83ed93f729 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -89,8 +89,9 @@ class ExpandAsGradOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - 
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAs"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "ExpandAs"); auto x_dims = ctx->GetInputDim("X"); auto x_grad_name = framework::GradVarName("X"); diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h index b189aa6f12274..cbaeb0c4e4256 100644 --- a/paddle/fluid/operators/expand_as_op.h +++ b/paddle/fluid/operators/expand_as_op.h @@ -61,7 +61,10 @@ class ExpandAsKernel : public framework::OpKernel { switch (rank) { REP_EXPAND_AS_TEMPLATE(MAX_RANK_SUPPORTED) default: - PADDLE_THROW("Only support tensor with rank being between 1 and 6."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support tensor with rank being between 1 and 6. But received " + "tensor X's rank = %d.", + rank)); } } @@ -77,13 +80,19 @@ class ExpandAsKernel : public framework::OpKernel { auto x_dims = in0->dims(); auto y_dims = target_tensor->dims(); for (int i = 0; i < y_dims.size(); ++i) { - PADDLE_ENFORCE_NE(x_dims[i], 0, "X(input) should not have 0 dim"); + PADDLE_ENFORCE_NE( + x_dims[i], 0UL, + platform::errors::InvalidArgument( + "X(input) should not have 0 dim. 
But received x_dims[%d] = 0.", + i)); bcast_dims[i] = y_dims[i] / x_dims[i]; bcast_dims_remainder += y_dims[i] % x_dims[i]; } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0, - "X(input) could not be broadcast together with remapped " - "shape(expand tensor's shape)"); + PADDLE_ENFORCE_EQ( + bcast_dims_remainder, 0UL, + platform::errors::InvalidArgument( + "X(input) could not be broadcast together with remapped " + "shape(expand tensor's shape)")); framework::DDim out_dims(in_dims); for (size_t i = 0; i < bcast_dims.size(); ++i) { out_dims[i] *= bcast_dims[i]; @@ -137,7 +146,10 @@ class ExpandAsGradKernel : public framework::OpKernel { switch (dims) { REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) default: - PADDLE_THROW("Only support tensor with rank being between 1 and 6."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support tensor with rank being between 1 and 6. But " + "received tensor's rank = %d.", + dims)); } } } @@ -149,12 +161,6 @@ class ExpandAsGradKernel : public framework::OpKernel { const std::vector& reduce_dims_vec) const { size_t reshape_size = reshape_dims_vec.size(); size_t reduce_size = reduce_dims_vec.size(); - PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(), - "Inconsistent size between template Dims and " - "reshape dimensions."); - PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(), - "Inconsistent size between template Dims and " - "reduce dimensions."); auto* in0 = context.Input(framework::GradVarName("Out")); auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index e9b4c7dacf8b4..04fa8db9a5a6f 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -404,6 +404,10 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker "the received is %d", bit_length)); }); + AddAttr("is_test", + "(bool, default false) Set to true for inference 
only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddComment(R"DOC( The scale of FakeChannelWiseQuantize operator is a vector. In detail, each channel of the input X has a scale value. diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 2f5afbe0eedf9..94a75f930beba 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -146,16 +146,19 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { auto* out = context.Output("Out"); auto* out_scale = context.Output("OutScale"); - T* out_scale_data = out_scale->mutable_data(context.GetPlace()); out->mutable_data(context.GetPlace()); int bit_length = context.Attr("bit_length"); int bin_cnt = std::pow(2, bit_length - 1) - 1; int quant_axis = context.Attr("quant_axis"); + bool is_test = context.Attr("is_test"); auto& dev_ctx = context.template device_context(); - FindChannelAbsMaxFunctor()(dev_ctx, *in, quant_axis, - out_scale_data); + if (!is_test) { + T* out_scale_data = out_scale->mutable_data(context.GetPlace()); + FindChannelAbsMaxFunctor()(dev_ctx, *in, quant_axis, + out_scale_data); + } ChannelClipAndFakeQuantFunctor()( dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out); } diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 847b24f4f0b0b..d791b2bcfd09f 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -23,64 +23,80 @@ class FCOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - "X(Input) of Fully Connected should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - "Out(Output) of Fully Connected should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, - "W(Input) of Fully Connected should 
not be null."); + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "FC"); + OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "FC"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FC"); - auto in_dims = ctx->GetInputDim("Input"); auto w_dims = ctx->GetInputDim("W"); bool padding_weights = ctx->Attrs().Get("padding_weights"); + PADDLE_ENFORCE_EQ( + w_dims.size(), 2, + platform::errors::InvalidArgument( + "The input Weight of fc is expected to be a 2-D tensor. " + "But received the number of Weight's dimensions is %d, " + "Weight's shape is %s.", + w_dims.size(), w_dims)); if (ctx->HasInput("Bias")) { auto bias_dims = ctx->GetInputDim("Bias"); auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1]; + + PADDLE_ENFORCE_LE( + bias_dims.size(), 2, + platform::errors::InvalidArgument( + "The input Bias of fc is expected to be a 1-D or 2-D tensor. But " + "received the number of Bias's dimensions is %d, " + "Bias's shape is %s.", + bias_dims.size(), bias_dims)); + + PADDLE_ENFORCE_EQ( + bias_dims[bias_dims.size() - 1], w_dims1, + platform::errors::InvalidArgument( + "The last dimension of input Bias is expected be equal " + "to the actual width of input Weight. But received the last " + "dimension of Bias is %d, Bias's shape is %s; " + "the actual width of Weight is %d, Weight's shape is %s.", + bias_dims[bias_dims.size() - 1], bias_dims, w_dims1, w_dims)); + if (bias_dims.size() == 2) { - PADDLE_ENFORCE_EQ(bias_dims[0], 1, - platform::errors::InvalidArgument( - "The shape of Bias is invalid." - "The height of Bias should be 1." - "But received height of Bias is %d.", - bias_dims[0])); - PADDLE_ENFORCE_EQ( - bias_dims[1], w_dims1, - platform::errors::InvalidArgument( - "The shape of Bias is invalid." - "The width of Bias should be equal to width of Weight." 
- "But received width of Bias is %d and width of Weight is %d.", - bias_dims[1], w_dims1)); - } else if (bias_dims.size() == 1) { PADDLE_ENFORCE_EQ( - bias_dims[0], w_dims1, + bias_dims[0], 1, platform::errors::InvalidArgument( - "The shape of Bias is invalid." - "The height of Bias should be equal to the width of weight." - "But received height of Bias is %d and width of Weight is %d.", - bias_dims[0], w_dims1)); + "The first dimension of input Bias is expected to be 1, " + "but received %d, Bias's shape is %s.", + bias_dims[0], bias_dims)); } } + auto in_dims = ctx->GetInputDim("Input"); + int in_num_col_dims = ctx->Attrs().Get("in_num_col_dims"); + PADDLE_ENFORCE_LT( + in_num_col_dims, in_dims.size(), + platform::errors::InvalidArgument( + "The attribute in_num_col_dims used to flatten Input to " + "a 2-D tensor, is expected to be less than the number of " + "Input's dimensions. But recieved in_num_col_dims is %d, " + "the number of Input's dimensions is %d, Input's shape is %s.", + in_num_col_dims, in_dims.size(), in_dims)); + auto& activation_type = ctx->Attrs().Get("activation_type"); if (!activation_type.empty()) { PADDLE_ENFORCE_EQ(activation_type, "relu", - "Activation %s is not supportetd in fc now.", - activation_type.c_str()); + platform::errors::InvalidArgument( + "The attribute activation_type of fc is expected " + "to be \"relu\", but received %s.", + activation_type.c_str())); } + if (ctx->Attrs().Get("use_mkldnn")) { PADDLE_ENFORCE_EQ( in_dims.size() >= 2 && in_dims.size() <= 4, true, platform::errors::Unimplemented( - "Fully Connected input should be 2D, 3D or 4D tensor.")); + "The Input of fc is expected to be a 2-D, 3-D or 4-D tensor when " + "use_mkldnn is set. 
But recieved the number of Input's " + "dimensions is %d, Input's shape is %s.", + in_dims.size(), in_dims)); } - PADDLE_ENFORCE_EQ(w_dims.size(), 2, - "Fully Connected weights should be 2-D tensor."); - int in_num_col_dims = ctx->Attrs().Get("in_num_col_dims"); - PADDLE_ENFORCE_GT( - in_dims.size(), in_num_col_dims, - "The input tensor Input's rank of FCOp should be larger than " - "in_num_col_dims."); std::vector output_dims; FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims, diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 907f61196d61b..6258dd0a3868f 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -32,11 +32,15 @@ inline void FCOutputSize(const framework::DDim& in_dims, auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0]; auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1]; - PADDLE_ENFORCE_EQ(in_mat_dims[1], w_dims0, - platform::errors::InvalidArgument( - "Fully Connected input and weigth size do not match. " - "input width: %d,weight height: %d", - in_mat_dims[1], w_dims0)); + PADDLE_ENFORCE_EQ( + in_mat_dims[1], w_dims0, + platform::errors::InvalidArgument( + "The input's second dimension and weight's first dimension is " + "expected to be the same. 
But recieved input's second dimension is " + "%d, input's shape is %s; weight's first dimension is %d, weight's " + "shape is %s.", + in_mat_dims[1], in_mat_dims, w_dims0, + framework::make_ddim({w_dims0, w_dims1}))); out_dims.reserve(static_cast(in_num_col_dims + 1)); for (int i = 0; i < in_num_col_dims; ++i) { diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h index 2c0c5f9ec0afa..c61b9a9e48854 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h @@ -276,7 +276,8 @@ static void RunFunctors(const framework::ExecutionContext &ctx, ctx, paddle::operators::math::MulFunctor(), paddle::operators::math::SigmoidFunctor(), in_x, in_y, outputs); } else { - PADDLE_THROW("%s has not been implemented.", funcs_str); + PADDLE_THROW(platform::errors::InvalidArgument( + "%s has not been implemented.", funcs_str)); } } @@ -374,7 +375,8 @@ static void RunGradFunctors( paddle::operators::math::SigmoidGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else { - PADDLE_THROW("%s has not been implemented.", funcs_str); + PADDLE_THROW(platform::errors::InvalidArgument( + "%s has not been implemented.", funcs_str)); } } @@ -386,16 +388,21 @@ class FusedElemwiseActivationKernel : public framework::OpKernel { "X", "FusedElemwiseActivation"); auto &in_y = GET_DATA_SAFELY(ctx.Input("Y"), "Input", "Y", "FusedElemwiseActivation"); - PADDLE_ENFORCE(ctx.HasOutput("Out"), "The output(Out) should not be empty"); + + PADDLE_ENFORCE_EQ(ctx.HasOutput("Out"), true, + platform::errors::InvalidArgument( + "The output(Out) should not be empty")); auto output = ctx.Output("Out"); std::vector outputs; outputs.emplace_back(output); if (ctx.Attr("save_intermediate_out")) { - PADDLE_ENFORCE(ctx.HasOutput("IntermediateOut"), - "The save_intermediate_out is enable, so the " - 
"IntermediateOut should not be empty."); + PADDLE_ENFORCE_EQ(ctx.HasOutput("IntermediateOut"), true, + platform::errors::InvalidArgument( + "The save_intermediate_out is enable, so the " + "IntermediateOut should not be empty.")); + auto intermediate_out = ctx.Output("IntermediateOut"); outputs.emplace_back(intermediate_out); } else { @@ -411,13 +418,18 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto in_y = ctx.Input("Y"); - PADDLE_ENFORCE(in_y != nullptr, "Input(Y) should not be nullptr."); + PADDLE_ENFORCE_NE(in_y, nullptr, platform::errors::InvalidArgument( + "Input(Y) should not be nullptr.")); auto in_out = ctx.Input("Out"); - PADDLE_ENFORCE(in_out != nullptr, "Input(Out) should not be nullptr."); + PADDLE_ENFORCE_NE( + in_out, nullptr, + platform::errors::InvalidArgument("Input(Out) should not be nullptr.")); auto in_out_grad = ctx.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE(in_out_grad != nullptr, - "Input(Out@Grad) should not be nullptr."); + PADDLE_ENFORCE_NE(in_out_grad, nullptr, + platform::errors::InvalidArgument( + "Input(Out@Grad) should not be nullptr.")); + framework::Tensor *in_x = const_cast(ctx.Input("X")); framework::Tensor *x_grad = @@ -437,24 +449,28 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { // recompute. 
in_intermediate_out = const_cast( ctx.Input("IntermediateOut")); - PADDLE_ENFORCE(in_intermediate_out != nullptr, - "The option of 'save_intermediate_out' is opened, " - "so the number of 'Out' should be two."); + PADDLE_ENFORCE_NE(in_intermediate_out, nullptr, + platform::errors::InvalidArgument( + "The option of 'save_intermediate_out' is opened," + " so the number of 'Out' should be two.")); } else { if (!InputXCanBeAbsent(functor_list)) { - PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be null."); + PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument( + "Input(X) should not be null.")); } } // Get in_x if (ctx.HasInput("X")) { - PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be nullptr."); + PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument( + "Input(X) should not be null.")); } else { // If functor_list contains elementwise_add, the backward doesn't use // in_x, in_y and in_out. - PADDLE_ENFORCE(InputXCanBeAbsent(functor_list), - "Only when the compoundfunctor contains " - "elementwise_add_grad, the 'X' could be absent."); + PADDLE_ENFORCE_EQ(InputXCanBeAbsent(functor_list), true, + platform::errors::InvalidArgument( + "Only when the compoundfunctor contains " + "elementwise_add_grad, the 'X' could be absent.")); in_x = const_cast(in_out_grad); } diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index aeaec84ba5c94..8713d58034241 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -204,9 +204,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::PermissionDenied( "The parameter W of a LookupTable " - "must be either LoDTensor or SelectedRows"); + "must be either LoDTensor or SelectedRows.")); } bool 
is_sparse = context.Attr("is_sparse"); diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc index ea7d6a93d1b28..08909bcb6fcb9 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc @@ -22,47 +22,73 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - "Input(X) of fused_fc_elementwise_layernorm should not be null."); - PADDLE_ENFORCE_EQ( - ctx->HasInput("W"), true, - "Input(W) of fused_fc_elementwise_layernorm should not be null."); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - "Input(Y) of fused_fc_elementwise_layernorm should not be null."); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - "Output(Out) of fused_fc_elementwise_layernorm should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "FusedFcElementwiseLayernorm"); + OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", + "FusedFcElementwiseLayernorm"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", + "FusedFcElementwiseLayernorm"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "FusedFcElementwiseLayernorm"); auto w_dims = ctx->GetInputDim("W"); - PADDLE_ENFORCE_EQ(w_dims.size(), 2, - "Fully Connected input should be 2-D tensor."); + PADDLE_ENFORCE_EQ( + w_dims.size(), 2, + platform::errors::InvalidArgument( + "The input Weight of fc is expected to be a 2-D tensor. 
" + "But received the number of Weight's dimensions is %d, ", + "Weight's shape is %s.", w_dims.size(), w_dims)); if (ctx->HasInput("Bias0")) { auto bias0_dims = ctx->GetInputDim("Bias0"); + + PADDLE_ENFORCE_LE(bias0_dims.size(), 2, + platform::errors::InvalidArgument( + "The input Bias of fc is expected to be an 1-D or " + "2-D tensor. But received the number of Bias's " + "dimensions is %d, Bias's shape is %s.", + bias0_dims.size(), bias0_dims)); + + PADDLE_ENFORCE_EQ( + bias0_dims[bias0_dims.size() - 1], w_dims[1], + platform::errors::InvalidArgument( + "The last dimension of input Bias is expected be equal " + "to the actual width of input Weight. But received the last " + "dimension of Bias is %d, Bias's shape is %s; " + "the actual width of Weight is %d, Weight's shape is %s.", + bias0_dims[bias0_dims.size() - 1], bias0_dims, w_dims[1], + w_dims)); + if (bias0_dims.size() == 2) { - PADDLE_ENFORCE_EQ(bias0_dims[0], 1, - "The shape of Bias must be [1, dim]."); - PADDLE_ENFORCE_EQ(bias0_dims[1], w_dims[1], - "The shape of Bias must be [1, dim]."); - } else if (bias0_dims.size() == 1) { - PADDLE_ENFORCE_EQ(bias0_dims[0], w_dims[1], - "The shape of Bias must be [1, dim]."); + PADDLE_ENFORCE_EQ( + bias0_dims[0], 1, + platform::errors::InvalidArgument( + "The first dimension of input Bias is expected to be 1, " + "but received %d, Bias's shape is %s.", + bias0_dims[0], bias0_dims)); } } auto x_dims = ctx->GetInputDim("X"); int x_num_col_dims = ctx->Attrs().Get("x_num_col_dims"); - PADDLE_ENFORCE_GT( - x_dims.size(), x_num_col_dims, - "The input tensor Input's rank of FCOp should be larger than " - "in_num_col_dims."); + PADDLE_ENFORCE_LT( + x_num_col_dims, x_dims.size(), + platform::errors::InvalidArgument( + "The attribute x_num_col_dims used to flatten input X to " + "a 2-D tensor, is expected to be less than the number of " + "input X's dimensions. 
But recieved x_num_col_dims is %d, " + "the number of input X's dimensions is %d, input X's shape is %s.", + x_num_col_dims, x_dims.size(), x_dims)); auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); PADDLE_ENFORCE_EQ( x_mat_dims[1], w_dims[0], - "Fully Connected input and weigth size do not match. %s, %s"); + platform::errors::InvalidArgument( + "The input's second dimension and weight's first dimension is " + "expected to be the same. But recieved input's second dimension is " + "%d, input's shape is %s; weight's first dimension is %d, weight's " + "shape is %s.", + x_mat_dims[1], x_mat_dims, w_dims[0], w_dims)); std::vector fc_out_dims; for (int i = 0; i < x_num_col_dims; ++i) { @@ -71,29 +97,67 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { fc_out_dims.push_back(w_dims[1]); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(framework::make_ddim(fc_out_dims), y_dims); + PADDLE_ENFORCE_EQ(framework::make_ddim(fc_out_dims), y_dims, + platform::errors::InvalidArgument( + "The output's shape of fc is expected to be equal to " + "that of input Y. But recieved output's shape of fc " + "is %s, input Y's shape is %s.", + framework::make_ddim(fc_out_dims), y_dims)); auto begin_norm_axis = ctx->Attrs().Get("begin_norm_axis"); PADDLE_ENFORCE_LT( begin_norm_axis, y_dims.size(), - "'begin_norm_axis' must be less than the rank of Input(Y)."); + platform::errors::InvalidArgument( + "The attribute begin_norm_axis used to flatten input Y to a 2-D " + "tensor, is expected to be less than the number of input Y's " + "dimensions. 
But recieved begin_norm_axis is %d, the number of " + "input Y's dimensions is %d, input Y's shape is %s.", + begin_norm_axis, y_dims.size(), y_dims)); auto y_mat_dim = framework::flatten_to_2d(y_dims, begin_norm_axis); int64_t dim_0 = y_mat_dim[0]; int64_t dim_1 = y_mat_dim[1]; if (ctx->HasInput("Scale")) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1); + auto scale_dims = ctx->GetInputDim("Scale"); + PADDLE_ENFORCE_EQ(scale_dims.size(), 1, + platform::errors::InvalidArgument( + "The input Scale is expected to be an 1-D tensor. " + "But recieved the number of input Scale's " + "dimensions is %d, input Scale's shape is %s.", + scale_dims.size(), scale_dims)); if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], dim_1, - "scale should with right"); + PADDLE_ENFORCE_EQ( + scale_dims[0], dim_1, + platform::errors::InvalidArgument( + "The first dimension of input Scale is expected to be equal to " + "the second dimension of input Y after flattened. " + "But recieved the first dimension of input Scale is %d, input " + "Scale's shape is %s; the second dimension of flattened input " + "Y is %d, input Y's shape is %s, flattened axis is %d.", + scale_dims[0], scale_dims, dim_1, y_dims, begin_norm_axis)); } } if (ctx->HasInput("Bias1")) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias1").size(), 1); + auto bias1_dims = ctx->GetInputDim("Bias1"); + PADDLE_ENFORCE_EQ( + bias1_dims.size(), 1, + platform::errors::InvalidArgument( + "The input Bias1 is expected to be an 1-D tensor. " + "But recieved the number of input Bias1's dimension is %d, " + "input Bias1's shape is %s.", + bias1_dims.size(), bias1_dims)); + if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias1")[0], dim_1, - "bias should with right"); + PADDLE_ENFORCE_EQ( + bias1_dims[0], dim_1, + platform::errors::InvalidArgument( + "The first dimension of input Bias1 is expected to be equal to " + "the second dimension of input Y after flattened. 
" + "But recieved the first dimension of input Bias1 is %d, input " + "Bias1's shape is %s; the second dimension of flatten input " + "Y is %d, input Y's shape is %s, flattened axis is %d.", + bias1_dims[0], bias1_dims, dim_1, y_dims, begin_norm_axis)); } } diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 5fad1b116de64..e51d94e4b1e05 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -86,7 +86,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // Weights for int8 kernel are of a type s8 const auto weights_dt = - is_INT8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32; + is_INT8 ? dnnl::memory::data_type::s8 : MKLDNNGetDataType(); // oneDNN RNN dimensions const int64_t D = 1; // Directions @@ -95,7 +95,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // Create memory descriptors auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); + MKLDNNMemoryFormat::ntc); auto weight_x_md = MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); auto weight_h_md = @@ -103,7 +103,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldgo); auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); + MKLDNNMemoryFormat::ntc); auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldnc); @@ -226,6 +226,8 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { } // TODO(grygielski) H0 is for now persistable + // TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does + // not support in yet) std::shared_ptr AcquireH0Memory(const Tensor* h0) { const std::string h0_key = memory_key_ + "@h0"; auto memory_p = @@ -397,14 +399,14 @@ 
template class FusionGRUMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const bool is_INT8 = std::is_same::value; + const bool is_bf16 = std::is_same::value; const bool force_fp32_output = ctx.Attr("force_fp32_output"); - // TODO(grygielski) Add option for bfloat - if (!is_INT8 || force_fp32_output) { + // BF16 does not support force output + if (!is_bf16 && force_fp32_output) { RunKernel(ctx); } else { - RunKernel(ctx); + RunKernel(ctx); } } @@ -495,4 +497,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace, ops::FusionGRUMKLDNNKernel, + ops::FusionGRUMKLDNNKernel, ops::FusionGRUMKLDNNKernel); diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index 03279a9b2c15b..1018adcd930a4 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -181,10 +181,22 @@ class InstanceNormKernel auto &dev_ctx = ctx.template device_context(); auto *place = dev_ctx.eigen_device(); + Eigen::DSizes shape(NxC, sample_size); +// Once eigen on Windows is updated, the if branch can be removed. 
+#ifndef EIGEN_HAS_INDEX_LIST Eigen::DSizes bcast(1, sample_size); Eigen::DSizes C_shape(C, 1); Eigen::DSizes NxC_shape(NxC, 1); - Eigen::DSizes shape(NxC, sample_size); + Eigen::DSizes rdims(1); +#else + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); + Eigen::IndexList> rdims; +#endif math::SetConstant set_constant; @@ -201,8 +213,6 @@ class InstanceNormKernel auto x_e = framework::EigenVector::Flatten(*x); auto x_arr = x_e.reshape(shape); - Eigen::DSizes rdims(1); - saved_mean_e.device(*place) = x_arr.mean(rdims); auto saved_variance_arr = (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; @@ -316,14 +326,25 @@ class InstanceNormGradKernel auto &dev_ctx = ctx.template device_context(); auto *place = dev_ctx.eigen_device(); + Eigen::DSizes rshape(NxC, sample_size); + Eigen::DSizes param_shape(N, C); + Eigen::DSizes shape(NxC, sample_size); +#ifndef EIGEN_HAS_INDEX_LIST Eigen::DSizes rdims(0); Eigen::DSizes mean_rdims(1); - Eigen::DSizes rshape(NxC, sample_size); Eigen::DSizes bcast(1, sample_size); Eigen::DSizes C_shape(C, 1); Eigen::DSizes NxC_shape(NxC, 1); - Eigen::DSizes param_shape(N, C); - Eigen::DSizes shape(NxC, sample_size); +#else + Eigen::IndexList> rdims; + Eigen::IndexList> mean_rdims; + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); +#endif math::SetConstant set_constant; diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index 488cbc6d517fc..d4f3fc5d7a622 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -27,9 +27,10 @@ static inline T NormalizeL1(T* x, size_t len) { // (This comment is from the old LinearChainCRFLayer.) // Right now, we just bet that sum won't be zero. 
If this really happens, we // will figure out what should be done then. - PADDLE_ENFORCE(sum, - "The unnormalized probabilities of all possible unfinished " - "sequences must be greater than 0."); + PADDLE_ENFORCE_GT( + sum, 0., platform::errors::InvalidArgument( + "The unnormalized probabilities of all possible unfinished " + "sequences must be greater than 0.")); T s = 1. / sum; for (size_t i = 0; i < len; ++i) x[i] *= s; return sum; @@ -84,13 +85,19 @@ class LinearChainCRFOpKernel : public framework::OpKernel { const Tensor* label_length = ctx.Input("Length"); length_data = label_length->data(); seq_num = label_length->numel(); - PADDLE_ENFORCE_EQ(seq_num, emission_dims[0], - "the size of Input(length) must be equal to " - "emission_dims[0]."); + PADDLE_ENFORCE_EQ( + seq_num, emission_dims[0], + platform::errors::InvalidArgument( + "the size of Input(length) must be equal to " + "emission_dims[0]. But input_size = %d, emission_dims[0] = %d.", + seq_num, emission_dims[0])); auto label_dims = label->dims(); - PADDLE_ENFORCE_EQ(seq_num, label_dims[0], - "the size of Input(length) must be equal to " - "label_dims[0]."); + PADDLE_ENFORCE_EQ( + seq_num, label_dims[0], + platform::errors::InvalidArgument( + "the size of Input(length) must be equal to " + "label_dims[0]. 
But input_size = %d, label_dims[0] = %d.", + seq_num, label_dims[0])); batch_size = emission_dims[0] * emission_dims[1]; tag_num = emission_dims[2]; @@ -102,7 +109,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel { math::set_constant(ctx.device_context(), alpha, 0.0); } else { in_lod = ctx.Input("Label")->lod(); - PADDLE_ENFORCE_NE(in_lod.size(), 0, "Input(Label) must be a sequence."); + PADDLE_ENFORCE_NE(in_lod.size(), 0, + platform::errors::InvalidArgument( + "Input(Label) must be a sequence.")); seq_num = in_lod[0].size() - 1; batch_size = emission_dims[0]; tag_num = emission_dims[1]; @@ -204,7 +213,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { const int64_t* lbl = label.data(); PADDLE_ENFORCE_LT( static_cast(*std::max_element(lbl, lbl + seq_length)), tag_num, - "An invalid tag label that execesses the largest tag number."); + platform::errors::InvalidArgument( + "An invalid tag label that exceeds the largest tag number.")); // Calculate the nominator part, which depends on the label sequence. ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] + @@ -254,7 +264,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { {emission_dims[0] * emission_dims[1], emission_dims[2]}); } else { in_lod = ctx.Input("Label")->lod(); - PADDLE_ENFORCE_NE(in_lod.size(), 0, "Input(Label) must be a sequence."); + PADDLE_ENFORCE_NE(in_lod.size(), 0, + platform::errors::InvalidArgument( + "Input(Label) must be a sequence.")); seq_num = static_cast(in_lod[0].size() - 1); } diff --git a/paddle/fluid/operators/load_op_xpu.cc b/paddle/fluid/operators/load_op_xpu.cc new file mode 100644 index 0000000000000..e56586552e498 --- /dev/null +++ b/paddle/fluid/operators/load_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/load_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + load, ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc index c325c0892ed81..917482589fcf3 100644 --- a/paddle/fluid/operators/lstm_unit_op.cc +++ b/paddle/fluid/operators/lstm_unit_op.cc @@ -23,23 +23,31 @@ class LstmUnitOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("C_prev"), - "Input(C_prev) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("C"), - "Output(C) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("H"), - "Output(H) of LSTM should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lstm_unit"); + OP_INOUT_CHECK(ctx->HasInput("C_prev"), "Input", "C_prev", "lstm_unit"); + OP_INOUT_CHECK(ctx->HasOutput("C"), "Output", "C", "lstm_unit"); + OP_INOUT_CHECK(ctx->HasOutput("H"), "Output", "H", "lstm_unit"); auto x_dims = ctx->GetInputDim("X"); auto c_prev_dims = ctx->GetInputDim("C_prev"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "Input(X)'s rank must be 2. 
Received %d instead.", x_dims.size())); if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0], - "Batch size of inputs and states must be equal"); + platform::errors::InvalidArgument( + "Batch size of inputs and states must be equal, " + "but received %d (inputs) " + "vs %d (states).", + x_dims[0], c_prev_dims[0])); PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4, - "Dimension of FC should equal to prev state * 4"); + platform::errors::InvalidArgument( + "Dimension of FC should equal to prev state * 4, " + "but received %d (dimension of FC) " + "vs %d (prev state * 4).", + x_dims[1], c_prev_dims[1] * 4)); } int b_size = c_prev_dims[0]; // batch size @@ -85,10 +93,10 @@ class LstmUnitGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")), - "Input(C@GRAD) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")), - "Input(H@GRAD) should not be null"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("C")), "Input", + framework::GradVarName("C"), "lstm_unit"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("H")), "Input", + framework::GradVarName("H"), "lstm_unit"); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("C_prev"), ctx->GetInputDim("C_prev")); diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu index 810b83cb535fe..3949a066e0868 100644 --- a/paddle/fluid/operators/lstm_unit_op.cu +++ b/paddle/fluid/operators/lstm_unit_op.cu @@ -93,8 +93,9 @@ template class LstmUnitOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()),
true, + paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); auto* x_tensor = ctx.Input("X"); auto* c_prev_tensor = ctx.Input("C_prev"); @@ -124,8 +125,9 @@ template class LstmUnitGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); auto x_tensor = ctx.Input("X"); auto c_prev_tensor = ctx.Input("C_prev"); diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h index 3fe7bda39b68d..99ae654d7ef0c 100644 --- a/paddle/fluid/operators/lstm_unit_op.h +++ b/paddle/fluid/operators/lstm_unit_op.h @@ -39,8 +39,9 @@ template class LstmUnitKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); auto* x_tensor = ctx.Input("X"); auto* c_prev_tensor = ctx.Input("C_prev"); @@ -82,8 +83,9 @@ template class LstmUnitGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); auto x_tensor = ctx.Input("X"); auto c_prev_tensor = ctx.Input("C_prev"); diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index f0a727f34fec7..a2d1d5295be82 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -91,7 +91,8 @@ 
class LSTMPKernel : public framework::OpKernel { else if (act_type == math::detail::ActivationType::kReLU) ReluFunctor()(d, x, y); else - PADDLE_THROW("unsupported activation type"); + PADDLE_THROW( + platform::errors::InvalidArgument("unsupported activation type")); } void Compute(const framework::ExecutionContext& ctx) const override { @@ -263,7 +264,8 @@ class LSTMPGradKernel : public framework::OpKernel { else if (act_type == math::detail::ActivationType::kReLU) ReluGradFunctor()(d, x, y, dy, dx); else - PADDLE_THROW("unsupported activation type"); + PADDLE_THROW( + platform::errors::InvalidArgument("unsupported activation type")); } void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 40cea7483f397..fec738378a64c 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -60,19 +60,25 @@ class Pool2dFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); @@ -81,9 +87,10 @@ 
class Pool2dFunctor { pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); + } + pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } @@ -137,19 +144,25 @@ class Pool2dFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); @@ -158,9 +171,9 @@ class Pool2dFunctor { pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } @@ -178,19 +191,25 @@ class Pool2dFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { @@ -201,10 +220,9 @@ class Pool2dFunctor { &ele); } } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; - + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width * output_channels + pw * output_channels + c] = ele; @@ -262,23 +280,29 @@ class Pool2dGradFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); + } + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -346,23 +370,29 @@ class Pool2dGradFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); + } + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -391,23 +421,29 @@ class Pool2dGradFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); + } + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -672,34 +708,43 @@ class Pool3dFunctor { int dstart, dend; int hstart, hend; int wstart, wend; + for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = + std::min(dstart + ksize_depth, input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } int output_idx = (pd * output_height + ph) * output_width + pw; T ele = pool_process.initial(); @@ -712,10 
+757,9 @@ class Pool3dFunctor { } } } - int pool_size = - (exclusive || adaptive) - ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[output_idx] = ele; } @@ -767,7 +811,6 @@ class Pool3dFunctor { int dstart, dend; int hstart, hend; int wstart, wend; - if (!channel_last) { const int input_stride = input_depth * input_height * input_width; const int output_stride = output_depth * output_height * output_width; @@ -777,29 +820,40 @@ class Pool3dFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = 
std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } + int output_idx = (pd * output_height + ph) * output_width + pw; T ele = pool_process.initial(); for (int d = dstart; d < dend; ++d) { @@ -811,10 +865,10 @@ class Pool3dFunctor { } } } - int pool_size = - (exclusive || adaptive) - ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[output_idx] = ele; } @@ -835,28 +889,38 @@ class Pool3dFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * 
(wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); @@ -871,10 +935,10 @@ class Pool3dFunctor { } } } - int pool_size = - (exclusive || adaptive) - ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); int output_idx = ((pd * output_height + ph) * output_width + pw) * @@ -943,34 +1007,42 @@ class Pool3dGradFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = + std::min(dstart + ksize_depth, input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + + pool_size = (dend - dstart) * (hend - hstart) * (wend - 
wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } - int pool_size = - (exclusive || adaptive) - ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } float scale = 1.0 / pool_size; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -1046,34 +1118,44 @@ class Pool3dGradFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); 
+ dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } - int pool_size = - (exclusive || adaptive) - ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } float scale = 1.0 / pool_size; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -1108,34 +1190,44 @@ class Pool3dGradFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, 
input_width); } - int pool_size = - (exclusive || adaptive) - ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } float scale = 1.0 / pool_size; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 809164df2056c..129298edafcf9 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -348,6 +348,181 @@ framework::DDim GetDimForInput(const framework::InferShapeContext &ctx, return dim; } +template +class MatMulDoubleGradKernel : public framework::OpKernel { + public: + void MatMul(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + const framework::Tensor &b, bool trans_b, bool flag, + framework::Tensor *out) const { + out->mutable_data(context.GetPlace()); + auto blas = math::GetBlas(context); + auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); + auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); + + int head_number = 1; +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) + head_number = context.Attr("head_number"); +#endif + + if (head_number <= 1 && a.dims().size() == 3 && b.dims().size() <= 2) { + // the transpose_X must be false, if is true, the transpose cost much time + if (!trans_a) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; + } + } + blas.MatMul(a, mat_dim_a, b, mat_dim_b, + static_cast(context.Attr("alpha")), out, + static_cast(flag)); + } + + void CalcInputGrad(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + bool is_fold_init_dims_a, const framework::Tensor &b, + bool trans_b, bool is_fold_init_dims_b, bool flag, + framework::Tensor *out) const { + if (out == nullptr) return; + bool need_combine = 
(a.dims().size() == 3 || b.dims().size() == 3) && + out->dims().size() == 2; + if (!need_combine) { + MatMul(context, a, trans_a, b, trans_b, flag, out); + } else { + auto &ctx = context.template device_context(); + MatMul(context, is_fold_init_dims_a + ? FoldInitDims(a) + : FoldHeadAndLastDims(ctx, a), + trans_a, is_fold_init_dims_b + ? FoldInitDims(b) + : FoldHeadAndLastDims(ctx, b), + trans_b, flag, out); + } + } + + void Compute(const framework::ExecutionContext &context) const override { + auto x = *context.Input("X"); + auto y = *context.Input("Y"); + auto dout = *context.Input("DOut"); + auto *ddx = context.Input("DDX"); + auto *ddy = context.Input("DDY"); + + auto *dx = context.Output("DX"); + auto *dy = context.Output("DY"); + auto *ddout = context.Output("DDOut"); + + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + framework::DDim ddout_dims; + if (ddout) { + ddout_dims = ddout->dims(); + if (ddout_dims != dout.dims()) { + ddout->Resize(dout.dims()); + } + } + + bool ddout_flag = false; + if (ddx) { + auto ddx_mat = *ddx; + if (ddx_mat.dims() != x.dims()) { + ddx_mat.Resize(x.dims()); + } + if (dy) { + if (transpose_x && transpose_y) { + // dy = dout' * ddx' + CalcInputGrad(context, dout, true, true, ddx_mat, true, false, false, + dy); + } else if (transpose_x) { + // dy = ddx * dout + CalcInputGrad(context, ddx_mat, false, false, dout, false, true, + false, dy); + } else if (transpose_y) { + // dy = dout' * ddx + CalcInputGrad(context, dout, true, true, ddx_mat, false, true, false, + dy); + } else { + // dy = ddx' * dout + CalcInputGrad(context, ddx_mat, true, true, dout, false, 
true, false, + dy); + } + } + + if (ddout) { + CalcInputGrad(context, ddx_mat, transpose_x, true, y, transpose_y, + false, ddout_flag, ddout); + ddout_flag = true; + } + } + + if (ddy) { + auto ddy_mat = *ddy; + if (ddy_mat.dims() != y.dims()) { + ddy_mat.Resize(y.dims()); + } + if (dx) { + if (transpose_x && transpose_y) { + // dx = ddy' * dout' + CalcInputGrad(context, ddy_mat, true, true, dout, true, false, false, + dx); + } else if (transpose_x) { + // dx = ddy * dout' + CalcInputGrad(context, ddy_mat, false, false, dout, true, false, + false, dx); + } else if (transpose_y) { + // dx = dout * ddy + CalcInputGrad(context, dout, false, false, ddy_mat, false, true, + false, dx); + } else { + // dx = dout * ddy' + CalcInputGrad(context, dout, false, false, ddy_mat, true, false, + false, dx); + } + } + + if (ddout) { + CalcInputGrad(context, x, transpose_x, true, ddy_mat, transpose_y, + false, ddout_flag, ddout); + } + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + } + } + + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + } + } + + if (ddout) { + if (ddout_dims != dout.dims()) { + ddout->Resize(ddout_dims); + } + } + } +}; + class MatMulOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -647,6 +822,61 @@ class MatMulOpGradMaker : public framework::SingleGradOpMaker { retv->SetAttrMap(this->Attrs()); } }; + +class MatMulOpDoubleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "matmul"); + OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "matmul"); + OP_INOUT_CHECK(context->HasInput("DOut"), "Input", "DOut", "matmul"); + + if (context->HasOutput("DX") && context->HasInput("DDY")) { + context->ShareDim("X", "DX"); + } + + if (context->HasOutput("DY") && 
context->HasInput("DDX")) { + context->ShareDim("Y", "DY"); + } + + if (context->HasOutput("DDOut") && + (context->HasInput("DDY") || context->HasInput("DDX"))) { + context->ShareDim("DOut", "DDOut"); + } + } +}; + +template +class MatMulOpDoubleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("matmul_grad_grad"); + retv->SetInput("X", this->Input("X")); + retv->SetInput("Y", this->Input("Y")); + retv->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + retv->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + retv->SetInput("DDY", this->OutputGrad(framework::GradVarName("Y"))); + + auto ddx = this->OutputGrad(framework::GradVarName("X")); + auto ddy = this->OutputGrad(framework::GradVarName("Y")); + + if (!ddx.empty() || !ddy.empty()) { + retv->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } + retv->SetOutput( + "DX", ddy.empty() ? this->EmptyInputGrad() : this->InputGrad("X")); + retv->SetOutput( + "DY", ddx.empty() ? 
this->EmptyInputGrad() : this->InputGrad("Y")); + + retv->SetAttrMap(this->Attrs()); + } +}; + } // namespace operators } // namespace paddle @@ -654,7 +884,10 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker, ops::MatMulOpGradMaker, ops::MatMulOpGradMaker); -REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad); +REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad, + ops::MatMulOpDoubleGradMaker, + ops::MatMulOpDoubleGradMaker); +REGISTER_OPERATOR(matmul_grad_grad, ops::MatMulOpDoubleGrad); REGISTER_OP_CPU_KERNEL( matmul, ops::MatMulKernel, ops::MatMulKernel); @@ -663,6 +896,11 @@ REGISTER_OP_CPU_KERNEL( ops::MatMulGradKernel, ops::MatMulGradKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); + #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, @@ -675,4 +913,8 @@ REGISTER_OP_CUDA_KERNEL( ops::MatMulGradKernel, ops::MatMulGradKernel); +REGISTER_OP_CUDA_KERNEL( + matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); #endif diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index 7db2e9421b5ca..6d8d18a3d126e 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -83,6 +83,18 @@ class MaxOutOp : public framework::OperatorWithKernel { "Attr(groups) of Op(maxout) should be " "larger than 1. 
But received %d.", groups)); + PADDLE_ENFORCE_EQ( + axis == 1 || axis == -1 || axis == 3, true, + platform::errors::InvalidArgument( + "axis only supported 1, -1 or 3, but received axis is: %d", axis)); + PADDLE_ENFORCE_EQ(in_x_dims.size(), 4, + platform::errors::InvalidArgument( + "x's dims should be 4, but received x's dims is: %d", + in_x_dims.size())); + + if (axis < 0) { + axis += in_x_dims.size(); + } PADDLE_ENFORCE_EQ( in_x_dims[axis] % groups, 0, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h index ec3897e4044ad..64b538fc5d5bd 100644 --- a/paddle/fluid/operators/maxout_op.h +++ b/paddle/fluid/operators/maxout_op.h @@ -31,6 +31,9 @@ class MaxOutKernel : public framework::OpKernel { Tensor* out = context.Output("Out"); int groups = context.template Attr("groups"); int axis = context.template Attr("axis"); + if (axis < 0) { + axis += in_x->dims().size(); + } math::MaxOutFunctor maxout_forward; maxout_forward(context.template device_context(), *in_x, out, @@ -49,6 +52,10 @@ class MaxOutGradKernel : public framework::OpKernel { Tensor* in_x_grad = context.Output(framework::GradVarName("X")); int groups = context.template Attr("groups"); int axis = context.template Attr("axis"); + if (axis < 0) { + axis += in_x->dims().size(); + } + auto& device_ctx = context.template device_context(); math::SetConstant zero; if (in_x_grad) { diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc new file mode 100644 index 0000000000000..c0aa00e79341e --- /dev/null +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +template +class AccuracyXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); + int* correct_data = correct->mutable_data(ctx.GetPlace()); + int* total_data = total->mutable_data(ctx.GetPlace()); + float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); + const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); + size_t num_samples = inference->dims()[0]; + size_t class_dim = inference->dims()[1]; + if (num_samples == 0) { + return; + } + size_t indices_int32_size = num_samples * class_dim * sizeof(int); + size_t indices_int64_size = num_samples * class_dim * sizeof(int64_t); + size_t label_int32_size = num_samples * sizeof(int); + size_t label_int64_size = num_samples * sizeof(int64_t); + auto& dev_ctx = ctx.template device_context(); + int* indices_int32_device = NULL; + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&indices_int32_device), + indices_int32_size), + XPU_SUCCESS, + platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU, Cannot allocate %s memory" + " on XPU. 
\n\nPlease check whether there is any other process " + "using XPU.\n", + string::HumanReadableSize(indices_int32_size))); + int* label_int32_device = NULL; + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&label_int32_device), + label_int32_size), + XPU_SUCCESS, + platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU, Cannot allocate %s memory" + " on XPU. \n\nPlease check whether there is any other process " + "using XPU.\n", + string::HumanReadableSize(label_int32_size))); + + int* indices_int32_host = + reinterpret_cast(std::malloc(indices_int32_size)); + int64_t* indices_int64_host = + reinterpret_cast(std::malloc(indices_int64_size)); + int* label_int32_host = + reinterpret_cast(std::malloc(label_int32_size)); + int64_t* label_int64_host = + reinterpret_cast(std::malloc(label_int64_size)); + dev_ctx.Wait(); + memory::Copy(platform::CPUPlace(), indices_int64_host, + BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + indices_data, indices_int64_size); + memory::Copy(platform::CPUPlace(), label_int64_host, + BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + label_data, label_int64_size); + for (int i = 0; i < num_samples; ++i) { + label_int32_host[i] = label_int64_host[i]; + for (int j = 0; j < class_dim; ++j) { + indices_int32_host[i * class_dim + j] = + indices_int64_host[i * class_dim + j]; + } + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + indices_int32_device, platform::CPUPlace(), indices_int32_host, + indices_int32_size); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + label_int32_device, platform::CPUPlace(), label_int32_host, + label_int32_size); + int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device, + label_int32_device, num_samples, class_dim, + correct_data, total_data, accuracy_data); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + dev_ctx.Wait(); + xpu_free(indices_int32_device); + xpu_free(label_int32_device); + 
std::free(indices_int32_host); + std::free(indices_int64_host); + std::free(label_int32_host); + std::free(label_int64_host); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + accuracy, + ops::AccuracyXPUKernel); + +#endif diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake new file mode 100644 index 0000000000000..232626df02e50 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake @@ -0,0 +1,2 @@ +cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op pooling transpose_op scope device_context enforce executor) + diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index bf12c61a4d9b1..72d2f779f800b 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -126,6 +126,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, data_dims, strides, ksize); + platform::PoolingMKLDNNHandler::ComputeAdaptivePoolParameters( + ctx, paddle::framework::vectorize(in_x->dims()), ksize, strides); + auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc new file mode 100644 index 0000000000000..e7caeef85f5f9 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +USE_OP(pool2d); +USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); +USE_OP(transpose); +USE_OP_DEVICE_KERNEL(transpose, MKLDNN); + +namespace paddle { +namespace operators { + +struct InputVars { + std::string name; + framework::LoDTensor *tensor; +}; + +TEST(test_pool2d_transpose_nhwc, cpu_place) { + framework::DDim dims({1, 4, 8, 512}); // NHWC shape + framework::DDim expected_dims({1, 7, 512, 3}); // NHWC expected shape + platform::CPUPlace p; + framework::Scope scope; + + InputVars input_name = {"x", + scope.Var("x")->GetMutable()}; + // Initialize input data + std::uniform_real_distribution dist(static_cast(10.0), + static_cast(20.0)); + std::mt19937 engine; + size_t numel = static_cast(framework::product(dims)); + input_name.tensor->Resize(dims); + auto data_ptr = input_name.tensor->mutable_data(p); + for (size_t i = 0; i < numel; ++i) { + data_ptr[i] = dist(engine); + } + + scope.Var("y")->GetMutable(); + auto *z = scope.Var("z")->GetMutable(); + + auto &pool = platform::DeviceContextPool::Instance(); + + // Make pool2d followed by transpose + + auto ksize = std::vector(2, 2); + auto op_pool = framework::OpRegistry::CreateOp( + "pool2d", {{"X", {"x"}}}, {{"Out", 
{"y"}}}, + {{"pooling_type", {std::string("max")}}, + {"ksize", {ksize}}, + {"data_format", {std::string("NHWC")}}, + {"use_mkldnn", {true}}}); + + auto axis = std::vector(4, 0); + axis[1] = 2; + axis[2] = 3; + axis[3] = 1; + auto op_transpose = framework::OpRegistry::CreateOp( + "transpose", {{"X", {"y"}}}, {{"Out", {"z"}}}, + {{"axis", {axis}}, {"use_mkldnn", {true}}}); + + op_pool->Run(scope, p); + op_transpose->Run(scope, p); + pool.Get(p)->Wait(); + + // Verify shape of output + PADDLE_ENFORCE_EQ(z->dims(), expected_dims, + platform::errors::InvalidArgument( + "Computed shape does not match expected shape")); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 1c75424fae7ef..8748078109f16 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -104,25 +104,29 @@ class NCEKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dist_probs->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistProbs) " - "should be equal to the number of total classes. But Received: " - "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) " - "= %d.", - dist_probs->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in Input(CustomDistProbs) " + "should be equal to the number of total classes. But Received: " + "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) " + "= %d.", + dist_probs->numel(), num_total_classes)); PADDLE_ENFORCE_EQ( dist_alias->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistAlias) " - "should be equal to the number of total classes. 
But Received: " - "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) " - "= %d.", - dist_alias->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in Input(CustomDistAlias) " + "should be equal to the number of total classes. But Received: " + "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) " + "= %d.", + dist_alias->numel(), num_total_classes)); PADDLE_ENFORCE_EQ( dist_alias_probs->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistAliasProbs) " - "should be equal to the number of total classes. But Received: " - "Input(CustomDistAliasProbs).numel() = %d, " - "Attr(num_total_classes) = %d.", - dist_alias_probs->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in " + "Input(CustomDistAliasProbs) " + "should be equal to the number of total classes. But Received: " + "Input(CustomDistAliasProbs).numel() = %d, " + "Attr(num_total_classes) = %d.", + dist_alias_probs->numel(), num_total_classes)); const float *probs_data = dist_probs->data(); const int *alias_data = dist_alias->data(); @@ -140,10 +144,11 @@ class NCEKernel : public framework::OpKernel { for (int x = 0; x < sample_labels->numel(); x++) { PADDLE_ENFORCE_GE(sample_labels_data[x], 0, - "ValueError: Every sample label should be " - "non-negative. But received: " - "Input(SampleLabels)[%d] = %d", - x, sample_labels_data[x]); + platform::errors::InvalidArgument( + "ValueError: Every sample label should be " + "non-negative. But received: " + "Input(SampleLabels)[%d] = %d", + x, sample_labels_data[x])); } auto sample_out = context.Output("SampleLogits"); @@ -311,25 +316,29 @@ class NCEGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dist_probs->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistProbs) " - "should be equal to the number of total classes. 
But Received: " - "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) " - "= %d.", - dist_probs->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in Input(CustomDistProbs) " + "should be equal to the number of total classes. But Received: " + "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) " + "= %d.", + dist_probs->numel(), num_total_classes)); PADDLE_ENFORCE_EQ( dist_alias->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistAlias) " - "should be equal to the number of total classes. But Received: " - "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) " - "= %d.", - dist_alias->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in Input(CustomDistAlias) " + "should be equal to the number of total classes. But Received: " + "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) " + "= %d.", + dist_alias->numel(), num_total_classes)); PADDLE_ENFORCE_EQ( dist_alias_probs->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistAliasProbs) " - "should be equal to the number of total classes. But Received: " - "Input(CustomDistAliasProbs).numel() = %d, " - "Attr(num_total_classes) = %d.", - dist_alias_probs->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in " + "Input(CustomDistAliasProbs) " + "should be equal to the number of total classes. 
But Received: " + "Input(CustomDistAliasProbs).numel() = %d, " + "Attr(num_total_classes) = %d.", + dist_alias_probs->numel(), num_total_classes)); const float *probs_data = dist_probs->data(); const int *alias_data = dist_alias->data(); diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index e42c4666e110f..75d1b36c7d6a8 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -37,13 +37,15 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { if (ctx->HasInput("AccumulatePositivePair") || ctx->HasInput("AccumulateNegativePair") || ctx->HasInput("AccumulateNeutralPair")) { - PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") && - ctx->HasInput("AccumulateNegativePair") && - ctx->HasInput("AccumulateNeutralPair"), - "All optional inputs(AccumulatePositivePair, " - "AccumulateNegativePair, AccumulateNeutralPair) of " - "PositiveNegativePairOp are required if one of them is " - "specified."); + PADDLE_ENFORCE_EQ( + ctx->HasInput("AccumulatePositivePair") && + ctx->HasInput("AccumulateNegativePair") && + ctx->HasInput("AccumulateNeutralPair"), + true, platform::errors::InvalidArgument( + "All optional inputs(AccumulatePositivePair, " + "AccumulateNegativePair, AccumulateNeutralPair) of " + "PositiveNegativePairOp are required if one of them " + "is specified.")); PADDLE_ENFORCE_EQ( ctx->GetInputDim("AccumulatePositivePair"), scalar_dim, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc index 7cd164bfd3a3d..9d2639c10301d 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc @@ -32,7 +32,7 @@ class LogsumexpOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LE(x_rank, 4, platform::errors::InvalidArgument( "The input tensor X's dimensions of 
logsumexp " - "should be less equal than 4. But received X's " + "should be less than or equal to 4. But received X's " "dimensions = %d, X's shape = [%s].", x_rank, x_dims)); auto axis = ctx->Attrs().Get>("axis"); @@ -45,20 +45,18 @@ class LogsumexpOp : public framework::OperatorWithKernel { axis.size())); for (size_t i = 0; i < axis.size(); i++) { - PADDLE_ENFORCE_LT( - axis[i], x_rank, - platform::errors::InvalidArgument( - "axis[%d] should be in the " - "range [-dimension(X), dimension(X)] " - "where dimesion(X) is %d. But received axis[i] = %d.", - i, x_rank, axis[i])); - PADDLE_ENFORCE_GE( - axis[i], -x_rank, - platform::errors::InvalidArgument( - "axis[%d] should be in the " - "range [-dimension(X), dimension(X)] " - "where dimesion(X) is %d. But received axis[i] = %d.", - i, x_rank, axis[i])); + PADDLE_ENFORCE_LT(axis[i], x_rank, + platform::errors::InvalidArgument( + "axis[%d] should be in the " + "range [-D, D), where D is the dimensions of X and " + "D is %d. But received axis[%d] = %d.", + i, x_rank, i, axis[i])); + PADDLE_ENFORCE_GE(axis[i], -x_rank, + platform::errors::InvalidArgument( + "axis[%d] should be in the " + "range [-D, D), where D is the dimensions of X and " + "D is %d. But received axis[%d] = %d.", + i, x_rank, i, axis[i])); if (axis[i] < 0) { axis[i] += x_rank; } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index e03824ca8c3f4..05bb37ee421ff 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -49,7 +49,8 @@ inline std::vector get_new_shape( "the element's shape must be [1]. 
But received the element's shape " "is [%s]", tensor->dims())); - if (platform::is_gpu_place(tensor->place())) { + if (platform::is_gpu_place(tensor->place()) || + platform::is_xpu_place(tensor->place())) { framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); @@ -362,7 +363,8 @@ class ReshapeKernel { if (shape_tensor) { auto *shape_data = shape_tensor->data(); framework::Tensor cpu_shape_tensor; - if (platform::is_gpu_place(shape_tensor->place())) { + if (platform::is_gpu_place(shape_tensor->place()) || + platform::is_xpu_place(shape_tensor->place())) { TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); @@ -375,9 +377,22 @@ class ReshapeKernel { out->Resize(out_dims); out->mutable_data(ctx.GetPlace(), in->type()); - framework::TensorCopy( - *in, ctx.GetPlace(), - ctx.template device_context(), out); + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(ctx.GetPlace())) { + auto &dev_ctx = + ctx.template device_context(); + xpu::memcpy_device( + dev_ctx.x_context(), out->data(), in->data(), + in->numel() * paddle::framework::SizeOfType(in->type())); + } else { +#endif + framework::TensorCopy( + *in, ctx.GetPlace(), + ctx.template device_context(), out); +#ifdef PADDLE_WITH_XPU + } +#endif out->Resize(out_dims); } }; @@ -644,3 +659,15 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, plat::float16, ops::ReshapeDoubleGradKernel); #endif + +#ifdef PADDLE_WITH_XPU +REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel, plat::float16, + ops::ReshapeKernel); +REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel, plat::float16, + ops::ReshapeGradKernel); +#endif diff --git a/paddle/fluid/operators/scale_op_xpu.cc 
b/paddle/fluid/operators/scale_op_xpu.cc new file mode 100644 index 0000000000000..4002be8100152 --- /dev/null +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/scale_op.h" +#include +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { +template +class ScaleXPUKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in_var = ctx.InputVar("X"); + auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); + auto scale = static_cast(ctx.Attr("scale")); + auto bias = static_cast(ctx.Attr("bias")); + auto bias_after_scale = ctx.Attr("bias_after_scale"); + auto* out_var = ctx.OutputVar("Out"); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); + out_slr->set_rows(in_slr.rows()); + out_slr->set_height(in_slr.height()); + } + auto* out = + framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); + out->mutable_data(in->place()); + PADDLE_ENFORCE_EQ( + in->dims(), out->dims(), + platform::errors::InvalidArgument("In and out should have the same dim," + " expected %s, but got %s.", + in->dims().to_str().c_str(), + out->dims().to_str().c_str())); + auto& dev_ctx = ctx.template device_context(); + int r = 
xpu::scale(dev_ctx.x_context(), in->numel(), scale, bias, + bias_after_scale, in->data(), out->data()); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + scale, ops::ScaleXPUKernel); + +#endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc index 99e8064d2446f..5f976685c982b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc @@ -59,20 +59,22 @@ class SequenceConvOp : public framework::OperatorWithKernel { filter_dims[0], context_length * in_dims[1])); if (ctx->Attrs().Get("paddingTrainable")) { - PADDLE_ENFORCE( - ctx->HasInput("PaddingData"), - "Input(PaddingData) of SequenceConvOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("PaddingData"), "Input", "PaddingData", + "sequence_conv"); framework::DDim padding_dim = ctx->GetInputDim("PaddingData"); int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); int total_pad = up_pad + down_pad; int input_width = static_cast(in_dims[1]); + bool start_equals_zero = context_start == 0; + bool length_equals_one = context_length == 1; + bool start_length = start_equals_zero && length_equals_one; - if (context_start == 0 && context_length == 1) { - PADDLE_THROW( - "If context_start is 0 and context_length is 1, paddingTrainable " - "should be false."); - } + PADDLE_ENFORCE_EQ( + start_length, false, + platform::errors::InvalidArgument( + "If context_start is 0 and context_length is 1, paddingTrainable " + "should be false.")); PADDLE_ENFORCE_EQ( padding_dim.size(), 2, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index 
1dbddfa709d72..758ff01b1e7ec 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -43,8 +43,11 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { "Output LoDTensor of SequenceEnumerate operator."); AddAttr("win_size", "(int) The enumerate sequence window size.") .AddCustomChecker([](const int& win_size) { - PADDLE_ENFORCE(win_size >= 2, - "The window size should be not less than 2."); + PADDLE_ENFORCE_GE(win_size, 2, + platform::errors::InvalidArgument( + "The window size should be not less than 2. " + "Received window size is %d.", + win_size)); }); AddAttr("pad_value", "(int) The enumerate sequence padding value.") .SetDefault(0); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index d5deb7582c7c0..6d8f60ce932ab 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -58,7 +58,10 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( static_cast(in_dims[0]), in_lod[0].back(), - "The actual input data's size mismatched with LoD information."); + platform::errors::InvalidArgument( + "The actual input data's size mismatched with LoD information. "
+ "Received input data size is %d (actual) vs %d (loD information).", + static_cast(in_dims[0]), in_lod[0].back())); /* Generate enumerate sequence set */ auto stream = context.cuda_device_context().stream(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h index 4807521bc0d92..d104d33caebb3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h @@ -29,21 +29,31 @@ class SequenceEnumerateKernel : public framework::OpKernel { int win_size = context.Attr("win_size"); auto pad_value = static_cast(context.Attr("pad_value")); - PADDLE_ENFORCE_EQ(in->lod().empty(), false, - "Input(X) Tensor of SequenceEnumerateOp does not contain " - "LoD information."); + PADDLE_ENFORCE_EQ( + in->lod().empty(), false, + platform::errors::InvalidArgument( + "Input(X) Tensor of SequenceEnumerateOp does not contain " + "LoD information.")); auto in_dims = in->dims(); auto lod0 = in->lod()[0]; PADDLE_ENFORCE_EQ( static_cast(in_dims[0]), lod0.back(), - "The actual input data's size mismatched with LoD information."); + platform::errors::InvalidArgument( + "The actual input data's size mismatched with LoD information. " + "Received input data size is %d (actual) vs %d (loD information).", + static_cast(in_dims[0]), lod0.back())); PADDLE_ENFORCE_EQ( in_dims.size(), 2UL, - "Input(X) of SequenceEnumerate operator's rank should be 2."); + platform::errors::InvalidArgument( + "Input(X) of SequenceEnumerate operator's rank should be 2. " + "Received %d instead.", + in_dims.size())); PADDLE_ENFORCE_EQ(in_dims[1], 1, - "Input(X) of SequenceEnumerate operator's 2nd " - "dimension should be 1."); + platform::errors::InvalidArgument( + "Input(X) of SequenceEnumerate operator's 2nd " + "dimension should be 1. 
Received %d instead.", + in_dims[1])); // Generate enumerate sequence set auto in_data = in->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index b8912dd4c7960..b06b1f755a22b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -69,8 +69,10 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { "= max(Input(X)).") .SetDefault(-1) .AddCustomChecker([](const int& v) { - PADDLE_ENFORCE(v < 0 || v >= 1, - "Attr(maxlen) must be less than 0 or larger than 1"); + PADDLE_ENFORCE_EQ( + v < 0 || v >= 1, true, + platform::errors::InvalidArgument( + "Attr(maxlen) must be less than 0 or larger than 1")); }); AddAttr("out_dtype", "Output data type"); AddComment(R"DOC( diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h index 8fe68deca66aa..37f9caf76ceba 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h @@ -42,14 +42,22 @@ class SequencePoolKernel : public framework::OpKernel { "Input(X) Tensor of SequencePoolOp " "does not contain LoD information.")); PADDLE_ENFORCE_LE(lod_level, 2UL, - "The lod level of input shall be no more than 2."); + platform::errors::InvalidArgument( + "The lod level of input shall be no more than 2. " + "Received lod level is %d.", + lod_level)); PADDLE_ENFORCE_GE( dims[0], /*batch size = */ static_cast(lod[lod_level - 1].size() - 1), - "The first dimension of Input(X) must be large than batch size."); + platform::errors::InvalidArgument( + "The first dimension of Input(X) must be large than batch size. "
+ "But received first dimension of Input(X) is %d, while batch " + "size is %d.", + dims[0], static_cast(lod[lod_level - 1].size() - 1))); if (lod_level > 1UL) { PADDLE_ENFORCE_EQ(lod[0][lod[0].size() - 1], lod[1].size() - 1, - "The input lod information is illegal."); + platform::errors::InvalidArgument( + "The input lod information is illegal.")); framework::LoD out_lod; out_lod.push_back(lod[0]); out->set_lod(out_lod); diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc new file mode 100644 index 0000000000000..2e9092a643253 --- /dev/null +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/shape_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, + ops::ShapeKernel, ops::ShapeKernel, + ops::ShapeKernel); + +#endif diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc new file mode 100644 index 0000000000000..44fd555544e7f --- /dev/null +++ b/paddle/fluid/operators/sign_op_xpu.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/sign_op.h" +#include "paddle/fluid/platform/xpu_header.h" +namespace paddle { +namespace operators { + +template +class SignXPUKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* out = context.Output("Out"); + auto* in = context.Input("X"); + out->mutable_data(in->place()); + auto xpu_context = context.device_context().x_context(); + int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN, + in->numel(), in->data(), out->data()); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + sign, ops::SignXPUKernel); + +#endif diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc new file mode 100644 index 0000000000000..29740000aeb4c --- /dev/null +++ b/paddle/fluid/operators/softmax_op_xpu.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class SoftmaxXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + const int rank = x->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + PADDLE_ENFORCE_EQ(axis == -1 || axis == rank - 1, true, + platform::errors::InvalidArgument( + "xpu softmax kernel only support last dimension of x " + "(axis==-1 or axis==x_dims-1), but received axis: " + "%d, x's shape: %s.", + axis, x->dims())); + + // allocate memory on device. + out->mutable_data(context.GetPlace()); + + const int n = SizeToAxis(axis, x->dims()); + const int d = SizeFromAxis(axis, x->dims()); + + auto& dev_ctx = context.template device_context(); + int r = xpu::softmax2d_forward(dev_ctx.x_context(), x->data(), + out->data(), n, d, d <= 2048); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(softmax2d_forward) return wrong " + "value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +template +class SoftmaxGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Input("Out"); + auto* dout = context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + const int rank = dx->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + + // allocate memory on device. 
+ dx->mutable_data(context.GetPlace()); + + const int n = SizeToAxis(axis, dx->dims()); + const int d = SizeFromAxis(axis, dx->dims()); + + auto& dev_ctx = context.template device_context(); + int r = + xpu::softmax2d_backward(dev_ctx.x_context(), out->data(), + dout->data(), dx->data(), n, d); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(softmax2d_backward) return wrong " + "value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + softmax, ops::SoftmaxXPUKernel); +REGISTER_OP_XPU_KERNEL( + softmax_grad, + ops::SoftmaxGradXPUKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index ba56e5e36f985..3ac7a5a127b37 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -357,7 +357,8 @@ static void HardLabelSoftmaxWithCrossEntropy( CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); default: - PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op"); + PADDLE_THROW(platform::errors::Unavailable( + "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); break; } #undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL @@ -397,7 +398,8 @@ static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data, CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); default: - PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op"); + PADDLE_THROW(platform::errors::Unavailable( + "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); break; } @@ -408,8 +410,10 @@ template class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { 
public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), - "This kernel only runs on GPU device."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(context.GetPlace()), true, + platform::errors::Unavailable("softmax_with_cross_entropy operator's " + "CUDA kernel only runs on GPU device.")); const Tensor* logits = context.Input("Logits"); const Tensor* labels = context.Input("Label"); Tensor* softmax = context.Output("Softmax"); @@ -469,8 +473,10 @@ template class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), - "This kernel only runs on GPU device."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(context.GetPlace()), true, + platform::errors::Unavailable("softmax_with_cross_entropy operator's " + "CUDA kernel only runs on GPU device.")); const Tensor* labels = context.Input("Label"); const T* loss_grad_data = context.Input(framework::GradVarName("Loss"))->data(); diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 93d8f42ce2175..479973a5daa5f 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -249,6 +249,19 @@ class Squeeze2GradOp : public framework::OperatorWithKernel { } }; +template +class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("squeeze"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetAttrMap(this->Attrs()); + } +}; + // FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze, // the XShape is used to carry the shape and lod of X which 
will be used in // squeeze_grad, in this way, the framework can reuse the memory of X @@ -279,8 +292,22 @@ class Squeeze2GradOpMaker : public framework::SingleGradOpMaker { } }; -DECLARE_INPLACE_OP_INFERER(SequeezeInplaceInferer, {"X", "Out"}); -DECLARE_INPLACE_OP_INFERER(SequeezeGradInplaceInferer, +template +class Squeeze2DoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("squeeze2"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetOutput("XShape", this->Input("XShape")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(SqueezeInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(SqueezeGradInplaceInferer, {framework::GradVarName("Out"), framework::GradVarName("X")}); DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X"); @@ -292,14 +319,18 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker, ops::SqueezeGradOpMaker, ops::SqueezeGradOpMaker); REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, + ops::SqueezeDoubleGradOpMaker, + ops::SqueezeDoubleGradOpMaker, ops::SqueezeGradNoNeedBufferVarsInferer); REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker, ops::Squeeze2GradOpMaker, ops::Squeeze2GradOpMaker, - ops::SequeezeInplaceInferer); + ops::SqueezeInplaceInferer); REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp, - ops::SequeezeGradInplaceInferer); + ops::Squeeze2DoubleGradOpMaker, + ops::Squeeze2DoubleGradOpMaker, + ops::SqueezeGradInplaceInferer); REGISTER_OP_CPU_KERNEL( squeeze, ops::SqueezeKernel, diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc new file mode 100644 index 0000000000000..14928061d23dd --- /dev/null +++ b/paddle/fluid/operators/sum_op_xpu.cc @@ -0,0 +1,65 @@ +/* 
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/sum_op.h" +#include +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +template +class SumXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto in_vars = context.MultiInputVar("X"); + auto out_var = context.OutputVar("Out"); + auto *out = context.Output("Out"); + bool in_place = out_var == in_vars[0]; + int N = in_vars.size(); + PADDLE_ENFORCE_EQ( + out_var->IsType(), true, + platform::errors::InvalidArgument("XPU only surpport LodTensor")); + if (!in_place) { + out->mutable_data(context.GetPlace()); + } + auto &dev_ctx = context.template device_context(); + std::vector ptrs(N, nullptr); + int valid_count = 0; + for (int i = 0; i < N; ++i) { + PADDLE_ENFORCE_EQ( + in_vars[i]->IsType(), true, + platform::errors::InvalidArgument("XPU only surpport LodTensor")); + auto &in_t = in_vars[i]->Get(); + if (in_t.numel() == 0) { + continue; + } + ptrs[valid_count] = reinterpret_cast(in_t.data()); + valid_count++; + } + int r = xpu::sum_batch(dev_ctx.x_context(), ptrs.data(), out->data(), + valid_count, out->numel()); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = 
paddle::operators; + +REGISTER_OP_XPU_KERNEL( + sum, ops::SumXPUKernel); +#endif diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 946fa6305d737..0e870937ec1a5 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -61,6 +61,19 @@ class TransposeOp : public framework::OperatorWithKernel { } framework::DDim out_dims(x_dims); +#ifdef PADDLE_WITH_MKLDNN + // Here we need to match dims to paddle layout + // as we are producing non-oneDNN result + if ((x_dims.size() >= 3) && + (paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC)) { + auto dims = framework::vectorize(x_dims); + std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); + x_dims = x_dims.reshape(dims); + VLOG(3) + << "Rotating Shape in Transpose from: kMKLDNN to: kNHWC output_shape"; + } +#endif for (size_t i = 0; i < axis_size; i++) { out_dims[i] = x_dims[axis[i]]; } diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index ee1361e361830..0e58e1391cfab 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -228,6 +228,19 @@ class UnsqueezeGradOpMaker : public framework::SingleGradOpMaker { } }; +template +class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("unsqueeze"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetAttrMap(this->Attrs()); + } +}; + // FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on // unsqueeze, the XShape is used to carry the shape and lod of X which // will be used in unsqueeze_grad, in this way, the framework can reuse @@ -304,6 +317,20 @@ class 
Unsqueeze2GradOp : public framework::OperatorWithKernel { } }; +template +class Unsqueeze2DoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("unsqueeze2"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetOutput("XShape", this->Input("XShape")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer, {framework::GradVarName("Out"), @@ -317,6 +344,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker, ops::UnsqueezeGradOpMaker, ops::UnsqueezeGradOpMaker); REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp, + ops::UnsqueezeDoubleGradOpMaker, + ops::UnsqueezeDoubleGradOpMaker, ops::UnsqueezeGradOpNoNeedBufferVarInferer); REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker, @@ -324,6 +353,8 @@ REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker, ops::Unsqueeze2GradOpMaker, ops::UnsqueezeInplaceInferer); REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp, + ops::Unsqueeze2DoubleGradOpMaker, + ops::Unsqueeze2DoubleGradOpMaker, ops::UnsqueezeGradInplaceInferer); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index e379832593c78..2df1f291f9f8c 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -164,6 +164,13 @@ bool MayIUse(const cpu_isa_t cpu_isa) { // AVX512F: EBX Bit 16 int avx512f_mask = (1 << 16); return (reg[1] & avx512f_mask) != 0; + } else if (cpu_isa == avx512_core) { + unsigned int avx512f_mask = (1U << 16);  // AVX512F: EBX Bit 16 + unsigned int avx512dq_mask = (1U << 17);  // AVX512DQ: EBX Bit 17 + unsigned int avx512bw_mask = (1U << 30);  // AVX512BW: EBX Bit 30 + unsigned int avx512vl_mask = (1U << 31);  // AVX512VL: EBX Bit 31
+ return ((reg[1] & avx512f_mask) && (reg[1] & avx512dq_mask) && + (reg[1] & avx512bw_mask) && (reg[1] & avx512vl_mask)); } } #endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a3ae9e48eea30..165321d9c87ff 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -47,6 +47,10 @@ limitations under the License. */ #include #include +#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) +#include +#endif + #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" #include "paddle/fluid/platform/errors.h" @@ -236,13 +240,14 @@ inline std::string SimplifyDemangleStr(std::string str) { } inline std::string GetCurrentTraceBackString() { - static constexpr int TRACE_STACK_LIMIT = 100; std::ostringstream sout; sout << "\n\n--------------------------------------\n"; sout << "C++ Traceback (most recent call last):"; sout << "\n--------------------------------------\n"; -#if !defined(_WIN32) +#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) + static constexpr int TRACE_STACK_LIMIT = 100; + void* call_stack[TRACE_STACK_LIMIT]; auto size = backtrace(call_stack, TRACE_STACK_LIMIT); auto symbols = backtrace_symbols(call_stack, size); @@ -261,7 +266,7 @@ inline std::string GetCurrentTraceBackString() { } free(symbols); #else - sout << "Windows not support stack backtrace yet.\n"; + sout << "Not support stack backtrace yet.\n"; #endif return sout.str(); } diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h index 32b7efc04c1f2..fb5cf9fb31915 100644 --- a/paddle/fluid/platform/macros.h +++ b/paddle/fluid/platform/macros.h @@ -25,6 +25,8 @@ limitations under the License. 
*/ classname& operator=(classname&&) = delete #endif +#ifndef PADDLE_WITH_MUSL #if defined(__FLT_MAX__) #define FLT_MAX __FLT_MAX__ #endif // __FLT_MAX__ +#endif // PADDLE_WITH_MUSL diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index b012a103ea303..d8dd166f325c8 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include #include @@ -81,12 +83,30 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in, return; } + auto print_dims = [](const std::vector& dims) { + std::ostringstream oss; + + if (!dims.empty()) { + oss << "["; + // Convert all but the last element to avoid a trailing "," + std::copy(dims.begin(), dims.end() - 1, + std::ostream_iterator(oss, ",")); + + // Now add the last element with no delimiter + oss << dims.back() << "]"; + } + + return oss.str(); + }; + switch (from) { case framework::DataLayout::kMKLDNN: if (to == framework::DataLayout::kNHWC) { auto dims = framework::vectorize(tensor_in->dims()); std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); tensor_in->Resize(framework::make_ddim(dims)); + VLOG(3) << "Rotating Shape from: kMKLDNN to: kNHWC output_shape" + << print_dims(dims); } break; case framework::DataLayout::kNHWC: @@ -94,6 +114,8 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in, auto dims = framework::vectorize(tensor_in->dims()); std::rotate(dims.begin() + 1, dims.end() - 1, dims.end()); tensor_in->Resize(framework::make_ddim(dims)); + VLOG(3) << "Rotating Shape from: kNHWC to: kMKLDNN output_shape" + << print_dims(dims); } break; default: diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index d1c5480c0f543..785627a09fb27 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -853,6 +853,9 @@ class PoolingMKLDNNHandler : 
public MKLDNNHandlerTAcquireForwardPrimitiveDescriptor( is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training, @@ -919,6 +922,27 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT& src_tz, std::vector& ksize, + std::vector& strides) { + if (ctx.Attr("adaptive")) { + // (jczaja): oneDNN is supporting only unchangable in size pool window + PADDLE_ENFORCE_EQ( + src_tz[src_tz.size() - 1] % ksize[1], 0, + platform::errors::Unimplemented( + "Input dim must be divisible by corressponding ksize dim.")); + PADDLE_ENFORCE_EQ( + src_tz[src_tz.size() - 2] % ksize[0], 0, + platform::errors::Unimplemented( + "Input dim must be divisible by corressponding ksize dim.")); + ksize[0] = src_tz[src_tz.size() - 2] / ksize[0]; + ksize[1] = src_tz[src_tz.size() - 1] / ksize[1]; + strides[0] = ksize[0]; + strides[1] = ksize[1]; + } + } + private: static inline int ComputeCeiledOutput(int input_size, int kernel_size, int padding, int stride) { diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index c1b81159aca97..c5e8ff807a2d3 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -14,19 +14,18 @@ #pragma once -#include -#include - #include + +#include #include +#include #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" #if !defined(_WIN32) -#include // dladdr -#include // backtrace +#include // dladdr #include #include #include // std::accumulate diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index be4d90597e1e1..c8e5048421cca 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -481,8 +481,8 @@ void BindAnalysisConfig(py::module *m) { py::arg("disable_trt_plugin_fp16") = false) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, - py::arg("zero_copy") = false, 
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, + py::arg("zero_copy") = false, py::arg("passes_filter") = std::vector(), py::arg("ops_filter") = std::vector()) .def("lite_engine_enabled", &AnalysisConfig::lite_engine_enabled) diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 9bc603c0ecc2c..ee6e541c9e6c6 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -49,6 +49,8 @@ std::map> op_ins_map = { {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}}, + {"hierarchical_sigmoid", + {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0929febc4d46f..0ee725c302215 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,9 +36,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" @@ -142,6 +142,17 @@ bool IsCompiledWithMKLDNN() { #endif } +bool SupportsBfloat16() { +#ifndef PADDLE_WITH_MKLDNN + return false; +#else + if (platform::MayIUse(platform::cpu_isa_t::avx512_core)) + return true; + else + return false; +#endif +} + bool IsCompiledWithBrpc() { #ifndef PADDLE_WITH_DISTRIBUTE return false; @@ -421,10 +432,12 @@ PYBIND11_MODULE(core_noavx, m) { return map_output; }); - m.def("save_op_compatible_info", [](framework::ProgramDesc &desc) { - framework::OpCompatibleMap op_compatible_map; - op_compatible_map.InitOpCompatibleMap(); - return op_compatible_map.ConvertToProto(desc.OpCompatibleMap()); + m.def("save_op_version_info", [](framework::ProgramDesc &desc) { + framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()}; + framework::compatible::SaveOpVersions( + framework::compatible::OpVersionRegistrar::GetInstance() + .GetVersionMap(), + &pb_vmap); }); m.def( @@ -1302,9 +1315,6 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace", R"DOC( - **Note**: - For multi-card tasks, please use `FLAGS_selected_gpus` environment variable to set the visible GPU device. - The next version will fix the problem with `CUDA_VISIBLE_DEVICES` environment variable. CUDAPlace is a descriptor of a device. It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. @@ -1323,8 +1333,10 @@ All parameter, weight, gradient are variables in Paddle. 
Examples: .. code-block:: python - import paddle.fluid as fluid - gpu_place = fluid.CUDAPlace(0) + import paddle + + place = paddle.CUDAPlace(0) + paddle.disable_static(place) )DOC") .def("__init__", @@ -1661,6 +1673,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); + m.def("supports_bfloat16", SupportsBfloat16); m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_dist", IsCompiledWithDIST); m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) { diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index d587081fbac8a..ad4bc20f9f0b1 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -4,37 +4,26 @@ function(train_test TARGET_NAME) set(multiValueArgs ARGS) cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(arg_list "") - if(train_test_ARGS) - foreach(arg ${train_test_ARGS}) - list(APPEND arg_list "_${arg}") - endforeach() + if (NOT APPLE AND NOT WIN32) + cc_test(test_train_${TARGET_NAME} + SRCS test_train_${TARGET_NAME}.cc + DEPS paddle_fluid_shared + ARGS --dirname=${PYTHON_TESTS_DIR}/book/) else() - list(APPEND arg_list "_") + cc_test(test_train_${TARGET_NAME} + SRCS test_train_${TARGET_NAME}.cc + DEPS paddle_fluid_api + ARGS --dirname=${PYTHON_TESTS_DIR}/book/) + endif() + set_tests_properties(test_train_${TARGET_NAME} + PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model) + if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_train_${TARGET_NAME} + PROPERTIES TIMEOUT 150) endif() - foreach(arg ${arg_list}) - string(REGEX REPLACE "^_$" "" arg "${arg}") - if (NOT APPLE AND NOT WIN32) - cc_test(test_train_${TARGET_NAME}${arg} - SRCS test_train_${TARGET_NAME}.cc - DEPS paddle_fluid_shared - ARGS 
--dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) - else() - cc_test(test_train_${TARGET_NAME}${arg} - SRCS test_train_${TARGET_NAME}.cc - DEPS paddle_fluid_api - ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) - endif() - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model) - if(NOT WIN32 AND NOT APPLE) - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES TIMEOUT 150) - endif() - endforeach() endfunction(train_test) if(WITH_TESTING) - train_test(recognize_digits ARGS mlp conv) + train_test(recognize_digits) endif() diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc index e7b698e1a34e2..fb993439bb8e4 100644 --- a/paddle/fluid/train/test_train_recognize_digits.cc +++ b/paddle/fluid/train/test_train_recognize_digits.cc @@ -32,16 +32,15 @@ DEFINE_string(dirname, "", "Directory of the train model."); namespace paddle { -void Train() { - CHECK(!FLAGS_dirname.empty()); +void Train(std::string model_dir) { framework::InitDevices(false); const auto cpu_place = platform::CPUPlace(); framework::Executor executor(cpu_place); framework::Scope scope; auto train_program = inference::Load( - &executor, &scope, FLAGS_dirname + "__model_combined__.main_program", - FLAGS_dirname + "__params_combined__"); + &executor, &scope, model_dir + "__model_combined__.main_program", + model_dir + "__params_combined__"); std::string loss_name = ""; for (auto op_desc : train_program->Block(0).AllOps()) { @@ -87,6 +86,10 @@ void Train() { EXPECT_LT(last_loss, first_loss); } -TEST(train, recognize_digits) { Train(); } +TEST(train, recognize_digits) { + CHECK(!FLAGS_dirname.empty()); + Train(FLAGS_dirname + "recognize_digits_mlp.train.model/"); + Train(FLAGS_dirname + "recognize_digits_conv.train.model/"); +} } // namespace paddle diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 
84713d513fb68..0af32da4e690b 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -235,7 +235,6 @@ from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS -from .framework import SaveLoadConfig #DEFINE_ALIAS from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS @@ -272,6 +271,7 @@ from . import jit from . import static +from . import amp # high-level api from .hapi import Model diff --git a/python/paddle/amp/__init__.py b/python/paddle/amp/__init__.py new file mode 100644 index 0000000000000..32587938512c4 --- /dev/null +++ b/python/paddle/amp/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .auto_cast import auto_cast +from .grad_scaler import GradScaler + +__all__ = ['auto_cast', 'GradScaler'] diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py new file mode 100644 index 0000000000000..e33f6e2afc846 --- /dev/null +++ b/python/paddle/amp/auto_cast.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.dygraph.amp import amp_guard + +__all__ = ['auto_cast'] + + +def auto_cast(enable=True, custom_white_list=None, custom_black_list=None): + """ + Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode. + If enabled, the input data type (float32 or float16) of each operator is decided + by autocast algorithm for better performance. + + Commonly, it is used together with `paddle.amp.GradScaler` to achieve Auto-Mixed-Precision in + imperative mode. + + Args: + enable(bool, optional): Enable auto-mixed-precision or not. Default is True. + custom_white_list(set|list, optional): The custom white_list. + custom_black_list(set|list, optional): The custom black_list. + + Examples: + + .. code-block:: python + + import paddle + + conv2d = paddle.nn.Conv2d(3, 2, 3, bias_attr=False) + data = paddle.rand([10, 3, 32, 32]) + + with paddle.amp.auto_cast(): + conv = conv2d(data) + print(conv.dtype) # FP16 + + with paddle.amp.auto_cast(enable=False): + conv = conv2d(data) + print(conv.dtype) # FP32 + + """ + return amp_guard(enable, custom_white_list, custom_black_list) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py new file mode 100644 index 0000000000000..9476f3765b3bc --- /dev/null +++ b/python/paddle/amp/grad_scaler.py @@ -0,0 +1,136 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.dygraph.amp import AmpScaler + +__all__ = ['GradScaler'] + + +class GradScaler(AmpScaler): + """ + GradScaler is used for Auto-Mixed-Precision training/inferring in dynamic graph + mode. It controls the scaling of loss and helps avoid numerical overflow. + The object of this class has two methods `scale()`, `minimize()`. + + `scale()` is used to multiply the loss by a scale ratio. + `minimize()` is similar to `Optimizer.minimize()` and performs parameter updates. + + Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in + dynamic graph mode. + + Args: + enable(bool, optional): Enable loss scaling or not. Default is True. + init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15. + incr_ratio(float, optional): The multiplier to use when increasing the loss + scaling. Default is 2.0. + decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing + the loss scaling. Default is 0.5. + incr_every_n_steps(int, optional): Increases loss scaling every n consecutive + steps with finite gradients. Default is 1000. + decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n + accumulated steps with nan or inf gradients. Default is 1. + use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True. + Returns: + A GradScaler object. + + Examples: + + .. 
code-block:: python + + import paddle + + model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.reduce_mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.minimize(optimizer, scaled) # update parameters + """ + + def __init__(self, + enable=True, + init_loss_scaling=2.**15, + incr_ratio=2.0, + decr_ratio=0.5, + incr_every_n_steps=1000, + decr_every_n_nan_or_inf=1, + use_dynamic_loss_scaling=True): + super(GradScaler, self).__init__(enable, init_loss_scaling, incr_ratio, + decr_ratio, incr_every_n_steps, + decr_every_n_nan_or_inf, + use_dynamic_loss_scaling) + + def scale(self, var): + """ + Multiplies a Tensor by the scale factor and returns scaled outputs. + If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. + + Args: + var (Tensor): The tensor to scale. + Returns: + The scaled tensor or original tensor. + + Examples: + .. code-block:: python + + import paddle + + model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.reduce_mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.minimize(optimizer, scaled) # update parameters + """ + return super(GradScaler, self).scale(var) + + def minimize(self, optimizer, *args, **kwargs): + """ + This function is similar as `Optimizer.minimize()`, which performs parameters updating. + + If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. 
+ Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + + Finally, the loss scaling ratio is updated. + + Args: + optimizer(Optimizer): The optimizer used to update parameters. + args: Arguments, which will be forwarded to `optimizer.minimize()`. + kwargs: Keyword arguments, which will be forwarded to `optimizer.minimize()`. + + Examples: + .. code-block:: python + + import paddle + + model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.reduce_mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.minimize(optimizer, scaled) # update parameters + """ + return super(GradScaler, self).minimize(optimizer, *args, **kwargs) diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 1fc29ad042883..c7798b15c67fe 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -744,13 +744,13 @@ def adaptive_localsgd(self): strategy.adaptive_localsgd = True # by default this is false """ - return self.strategy.localsgd + return self.strategy.adaptive_localsgd @adaptive_localsgd.setter @is_strict_auto def adaptive_localsgd(self, flag): if isinstance(flag, bool): - self.strategy.localsgd = flag + self.strategy.adaptive_localsgd = flag else: print("WARNING: adaptive_localsgd should have value of bool type") diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 3fdd6e9248303..7eb3a5659654a 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -187,6 +187,8 
@@ def init(self, role_maker=None, is_collective=False): self.strategy_compiler = StrategyCompiler() if paddle.fluid.framework.in_dygraph_mode(): + if self.worker_num() == 1: + return if parallel_helper._is_parallel_ctx_initialized(): warnings.warn( "The dygraph parallel environment has been initialized.") diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index deba3b4a17d1b..ce9826d7e59ae 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -530,13 +530,6 @@ def _get_heter_worker_endpoint(self): return self._heter_trainer_endpoints[(self._current_id) % self._heter_worker_num()] - def _get_heter_worker_device(self): - """ - Returns: - string: heter_trainer's device of current node, e.g: CPU/GPU/XPU - """ - return self._heter_trainer_device.upper() - class PaddleCloudRoleMaker(RoleMakerBase): def __init__(self, is_collective=False, **kwargs): @@ -677,88 +670,99 @@ def _is_heter_worker(self): return self._role == Role.HETER_WORKER def _ps_env(self): - try: - # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set - # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002 - self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST") - - if self._server_endpoints is None: - # back to non_distributed execution. - self._server_endpoints = "" - self._trainers_num = 1 - self._role = Role.WORKER - self._current_id = 0 - self._nodes_num = 1 - self._heter_trainers_num = 0 - self._heter_trainer_endpoints = None - self._non_distributed = True - return - - self._server_endpoints = self._server_endpoints.split(",") - - self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS") - if self._worker_endpoints: - self._worker_endpoints = self._worker_endpoints.split(",") - else: - self._worker_endpoints = [] + # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set + # format: string(ip:port,ip:port), eg. 
127.0.0.1:6001,127.0.0.1:6002 + self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST", None) + + if self._server_endpoints is None: + # back to non_distributed execution. + self._server_endpoints = "" + self._trainers_num = 1 + self._role = Role.WORKER + self._current_id = 0 + self._nodes_num = 1 + self._heter_trainers_num = 0 + self._heter_trainer_endpoints = None + self._non_distributed = True + return + + self._server_endpoints = self._server_endpoints.split(",") + + self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", None) + if self._worker_endpoints != None: + self._worker_endpoints = self._worker_endpoints.split(",") + else: + self._worker_endpoints = [] + + trainers_num = os.getenv("PADDLE_TRAINERS_NUM", None) + if trainers_num == None: + raise ValueError( + "Can not find PADDLE_TRAINERS_NUM, please check your environment." + ) + trainers_num = int(trainers_num) - trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"]) - training_role = os.environ["TRAINING_ROLE"] + training_role = os.getenv("TRAINING_ROLE", None) + if training_role == None: + raise ValueError( + "Can not find TRAINING_ROLE, please check your environment.") - if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]: + if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]: + raise ValueError( + "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.". + format(training_role)) + + # For heter parameter server env setting + heter_trainer_eplist = os.getenv("PADDLE_HETER_TRAINER_IP_PORT_LIST", + "") + if heter_trainer_eplist != "": + try: + heter_trainer_eplist = os.environ[ + "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",") + except: raise ValueError( - "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.". 
- format(training_role)) - - # For heter parameter server env setting - heter_trainer_eplist = os.getenv( - "PADDLE_HETER_TRAINER_IP_PORT_LIST", None) - heter_trainer_device = os.getenv("PADDLE_HETER_TRAINER_DEVICE", - None) - if heter_trainer_eplist and heter_trainer_device: - try: - heter_trainer_eplist = os.environ[ - "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",") - except: - raise ValueError( - "Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ." - ) - - self._is_heter_parameter_server_mode = True - heter_trainers_num = len(heter_trainer_eplist) - current_node_device = heter_trainer_device.upper() - if current_node_device not in ["CPU", "GPU", "XPU"]: - raise ValueError( - "Heter Trainer doesn't support {} device now, please use CPU / GPU / XPU(KunLun)". - format(heter_trainer_device)) - self._heter_trainer_device = current_node_device - else: - self._is_heter_parameter_server_mode = False - heter_trainers_num = 0 - - if training_role == "TRAINER": - role = Role.WORKER - current_id = int(os.environ["PADDLE_TRAINER_ID"]) - if len(self._worker_endpoints) > 0: - self._cur_endpoint = self._worker_endpoints[current_id] - elif training_role == "PSERVER": - role = Role.SERVER - port = os.environ["PADDLE_PORT"] - ip = os.environ["POD_IP"] - self._cur_endpoint = ip + ":" + port - current_id = self._server_endpoints.index(self._cur_endpoint) - elif training_role == "HETER_TRAINER": - role = Role.HETER_WORKER - cur_ip = os.environ["POD_IP"] - cur_port = os.environ["PADDLE_PORT"] - curr_endpoint = ":".join([cur_ip, cur_port]) - current_id = heter_trainer_eplist.index(curr_endpoint) - else: + "Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ." 
+ ) + + self._is_heter_parameter_server_mode = True + heter_trainers_num = len(heter_trainer_eplist) + else: + self._is_heter_parameter_server_mode = False + heter_trainers_num = 0 + + if training_role == "TRAINER": + role = Role.WORKER + current_id = os.getenv("PADDLE_TRAINER_ID", None) + if current_id == None: raise ValueError( - "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER") - except ValueError as e: - raise ValueError( - "Something wrong with PaddleCloud, please check environment") + "Can not find PADDLE_TRAINER_ID, please check your environment." + ) + current_id = int(current_id) + if len(self._worker_endpoints) > 0: + self._cur_endpoint = self._worker_endpoints[current_id] + elif training_role == "PSERVER": + role = Role.SERVER + port = os.getenv("PADDLE_PORT", None) + if port == None: + raise ValueError( + "Can not find PADDLE_PORT, please check your environment.") + ip = os.getenv("POD_IP", None) + if ip == None: + raise ValueError( + "Can not find POD_IP, please check your environment.") + self._cur_endpoint = ip + ":" + port + current_id = self._server_endpoints.index(self._cur_endpoint) + elif training_role == "HETER_TRAINER": + role = Role.HETER_WORKER + cur_port = os.getenv("PADDLE_PORT", None) + if cur_port == None: + raise ValueError( + "Can not find PADDLE_PORT, please check your environment.") + cur_ip = os.getenv("POD_IP", None) + if cur_ip == None: + raise ValueError( + "Can not find POD_IP, please check your environment.") + curr_endpoint = ":".join([cur_ip, cur_port]) + current_id = heter_trainer_eplist.index(curr_endpoint) self._trainers_num = trainers_num self._role = role diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 015d59b516e94..2e23a915454fa 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -89,14 +89,16 @@ def _parse_args(): description='''start paddle training using multi-process mode. 
see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2- ''') + base_group = parser.add_argument_group("Base Parameters") - # Optional arguments for the launch helper - parser.add_argument( - "--ips", + base_group.add_argument( + "--log_dir", type=str, - default="127.0.0.1", - help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..") - parser.add_argument( + default="log", + help="The path for each process's log.If it's not set, the log will printed to default pipe." + ) + + base_group.add_argument( "--gpus", type=str, default=None, @@ -104,22 +106,7 @@ def _parse_args(): "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training." ) - parser.add_argument( - "--servers", type=str, default="", help="User defined servers ip:port") - parser.add_argument( - "--workers", type=str, default="", help="User defined workers ip:port") - parser.add_argument("--worker_num", type=int, help="number of workers") - - parser.add_argument("--server_num", type=int, help="number of servers") - - parser.add_argument( - "--log_dir", - type=str, - default="log", - help="The path for each process's log.If it's not set, the log will printed to default pipe." 
- ) - # positional - parser.add_argument( + base_group.add_argument( "training_script", type=str, help="The full path to the single GPU training " @@ -127,8 +114,34 @@ def _parse_args(): "followed by all the arguments for the " "training script") - # rest from the training program - parser.add_argument('training_script_args', nargs=REMAINDER) + base_group.add_argument('training_script_args', nargs=REMAINDER) + + # Optional arguments for the launch helper + # for collective + collective_group = parser.add_argument_group("Collective Parameters") + collective_group.add_argument( + "--ips", + type=str, + default="127.0.0.1", + help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..") + + ps_group = parser.add_argument_group("Parameter-Server Parameters") + # for parameter server + ps_group.add_argument( + "--servers", type=str, default="", help="User defined servers ip:port") + ps_group.add_argument( + "--workers", type=str, default="", help="User defined workers ip:port") + ps_group.add_argument( + "--heter_workers", + type=str, + default="", + help="User defined heter workers ip:port") + + ps_group.add_argument("--worker_num", type=int, help="number of workers") + ps_group.add_argument("--server_num", type=int, help="number of servers") + ps_group.add_argument( + "--heter_worker_num", type=int, help="number of heter_workers") + return parser.parse_args() @@ -166,35 +179,6 @@ def get_cluster_from_args(args, gpus): return get_cluster(node_ips, node_ip, trainer_endpoints, gpus) -def get_gpus(gpus): - if gpus is None: - gpus_num = fluid.core.get_cuda_device_count() - res_gpus = [str(x) for x in range(0, gpus_num)] - else: - cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") - if cuda_visible_devices is None or cuda_visible_devices == "": - res_gpus = [x.strip() for x in gpus.split(',')] - else: - # change gpus into relative values - # e.g. 
CUDA_VISIBLE_DEVICES=4,5,6,7; args.gpus=4,5,6,7; - # therefore gpus=0,1,2,3 - cuda_visible_devices_list = cuda_visible_devices.split(',') - for x in gpus.split(','): - assert x in cuda_visible_devices_list, "Can't find "\ - "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\ - % (x, cuda_visible_devices) - res_gpus = [ - cuda_visible_devices_list.index(x.strip()) - for x in gpus.split(',') - ] - logger.info("Change selected_gpus into reletive values. --ips:{} " - "will change into relative_ips:{} according to your " - "CUDA_VISIBLE_DEVICES:{}".format( - gpus, res_gpus, cuda_visible_devices_list)) - - return res_gpus - - def launch_collective(args): # parse arguments, used for cloud-single-machine and local gpus = get_gpus(args.gpus) @@ -245,209 +229,37 @@ def launch_collective(args): shutil.rmtree(gloo_rendezvous_dir) -def launch_ps(args): - ports = None - start_port = 6170 - if args.server_num: - server_num = args.server_num - ports = get_ports(server_num, 0) - server_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports]) - else: - assert args.servers != "", "The setting of CPU mode must be either server_num or servers." 
- server_endpoints = args.servers - server_endpoints_ips = [ - x.strip().split(":")[0] for x in server_endpoints.split(",") - ] - server_endpoints_port = [ - x.strip().split(":")[1] for x in server_endpoints.split(",") +def launch_ps(args, distribute_mode): + cloud_flag = cloud_utils.use_paddlecloud() + + # for ps-cpu on paddlecloud + if cloud_flag and distribute_mode == DistributeMode.PS: + direct_start(args) + return + elif cloud_flag and distribute_mode == DistributeMode.PS_HETER: + cloud_ps_heter_env_set(args) + args.workers = os.getenv("PADDLE_TRAINER_ENDPOINTS") + args.servers = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST") + args.heter_workers = os.getenv("PADDLE_HETER_TRAINER_IP_PORT_LIST") + + ps_launcher = ParameterServerLauncher(args, distribute_mode) + ps_launcher.start_ps() + return + + +def which_distributed_mode(args): + ps_args = [ + '--worker_num', + '--server_num', + '--heter_worker_num', + '--servers', + '--workers', + '--heter_workers', ] - server_num = len(server_endpoints_ips) - - if args.worker_num: - worker_num = args.worker_num - ports = get_ports(worker_num, server_num) - worker_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports]) - else: - assert args.workers != "", "The setting of CPU mode must be either worker_num or workers." - worker_endpoints = args.workers - worker_endpoints_ips = [ - x.strip().split(":")[0] for x in worker_endpoints.split(",") - ] - worker_num = len(worker_endpoints_ips) - node_ips = list(set(server_endpoints_ips + worker_endpoints_ips)) - worker_endpoints_len = [ - len(x.strip().split(":")) for x in worker_endpoints.split(",") - ] - if 1 in worker_endpoints_len: - # if no port value in worker_endpoints, will set default port values. 
- worker_endpoints_port = range(start_port + server_num, - start_port + server_num + worker_num, 1) - else: - worker_endpoints_port = [ - x.strip().split(":")[1] for x in worker_endpoints.split(",") - ] - - # local train - if len(set(node_ips)) == 1: - current_node_ip = node_ips[0] - else: - _, current_node_ip = get_host_name_ip() - - assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ - % (current_node_ip, node_ips) - node_rank = node_ips.index(current_node_ip) - logger.debug( - "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}". - format(node_ips, current_node_ip, node_rank, server_endpoints_port)) - - cluster = Cluster(hdfs=None) - server_rank = 0 - worker_rank = 0 - for node_rank, ip in enumerate(node_ips): - pod = Pod() - pod.rank = node_rank - pod.addr = ip - for i in range(len(server_endpoints_ips)): - if ip == server_endpoints_ips[i]: - server = Trainer() - server.endpoint = "%s:%s" % (ip, server_endpoints_port[i]) - server.rank = server_rank - server_rank += 1 - pod.servers.append(server) - for j in range(len(worker_endpoints_ips)): - if ip == worker_endpoints_ips[j]: - worker = Trainer() - worker.endpoint = "%s:%s" % (ip, worker_endpoints_port[i]) - worker.rank = worker_rank - worker_rank += 1 - pod.workers.append(worker) - - cluster.pods.append(pod) - - pod_rank = node_ips.index(current_node_ip) - pod = cluster.pods[pod_rank] - - default_env = os.environ.copy() - current_env = copy.copy(default_env) - - gloo_rendezvous_dir = tempfile.mkdtemp() - # add gloo env - current_env["PADDLE_WITH_GLOO"] = "1" - current_env["PADDLE_GLOO_RENDEZVOUS"] = "3" - current_env["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir - - current_env.pop("http_proxy", None) - current_env.pop("https_proxy", None) - procs = [] - cmds = [] - log_fns = [] - for idx, cur_server in enumerate(pod.servers): - proc_env = { - "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, - "PADDLE_TRAINER_ENDPOINTS": 
worker_endpoints, - "PADDLE_PORT": cur_server.endpoint.split(":")[1], - "TRAINING_ROLE": "PSERVER", - "PADDLE_TRAINERS_NUM": str(worker_num), - "POD_IP": cur_server.endpoint.split(":")[0] - } - current_env.update(proc_env) - - cmd = [sys.executable, "-u", args.training_script - ] + args.training_script_args - cmds.append(cmd) - - if idx == 0: - logger.info( - "Local server start {} processes. First process distributed " - "environment info (Only For Debug): {}".format( - len(pod.servers), - pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) - - if args.log_dir is not None: - os.system("mkdir -p {}".format(args.log_dir)) - fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w") - log_fns.append(fn) - proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn) - else: - proc = subprocess.Popen(cmd, env=current_env) - - tp = TrainerProc() - tp.proc = proc - tp.rank = cur_server.rank - tp.local_rank = idx - tp.log_fn = fn - tp.log_offset = fn.tell() if fn else None - tp.cmd = cmd - - procs.append(tp) - - for idx, cur_worker in enumerate(pod.workers): - proc_env = { - "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, - "PADDLE_TRAINER_ENDPOINTS": worker_endpoints, - "PADDLE_TRAINERS_NUM": str(worker_num), - "TRAINING_ROLE": "TRAINER", - "PADDLE_TRAINER_ID": str(cur_worker.rank) - } - current_env.update(proc_env) - - cmd = [sys.executable, "-u", args.training_script - ] + args.training_script_args - cmds.append(cmd) - - if idx == 0: - logger.info( - "Local worker start {} processes. 
First process distributed " - "environment info (Only For Debug): {}".format( - len(pod.workers), - pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) - - if args.log_dir is not None: - os.system("mkdir -p {}".format(args.log_dir)) - fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w") - log_fns.append(fn) - proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn) - else: - proc = subprocess.Popen(cmd, env=current_env) - - tp = TrainerProc() - tp.proc = proc - tp.rank = cur_worker.rank - tp.local_rank = idx - tp.log_fn = fn - tp.log_offset = fn.tell() if fn else None - tp.cmd = cmd - - procs.append(tp) - - logger.info( - "Please check servers and workers logs in {}/workerlog.* and {}/serverlog.*". - format(args.log_dir, args.log_dir)) - # only wait worker to finish here - for i, proc in enumerate(procs): - if i < len(pod.servers): - continue - procs[i].proc.wait() - if len(log_fns) > 0: - log_fns[i].close() - - print("all workers exit, going to finish parameter server", file=sys.stderr) - for i in range(len(pod.servers)): - if len(log_fns) > 0: - log_fns[i].close() - procs[i].proc.terminate() - print("all parameter server are killed", file=sys.stderr) - - if os.path.exists(gloo_rendezvous_dir): - shutil.rmtree(gloo_rendezvous_dir) + collective_args = ['--ips'] + ps_heter_args = ["--heter_worker_num", "--heter_workers"] -def launch(): - args = _parse_args() - logger = get_logger() - _print_arguments(args) - ps_args = ['--worker_num', '--server_num', '--servers', '--workers'] - collective_args = ['--ips', '--gpus'] has_ps_args = [ ps_arg for ps_arg in ps_args if ps_arg in " ".join(sys.argv[1:-1]) ] @@ -455,23 +267,46 @@ def launch(): co_arg for co_arg in collective_args if co_arg in " ".join(sys.argv[1:-1]) ] + + if len(has_ps_args) > 1 and len(has_collective_args) > 1: + raise ValueError( + "Only one mode(Collective or Parameter-Server) can be selected at the same time, but more than one configuration was received." 
+ ) + if fluid.core.is_compiled_with_cuda(): cuda_device_num = fluid.core.get_cuda_device_count() else: cuda_device_num = 0 - if len(has_ps_args) > 0 or cuda_device_num == 0: - logger.info("Run parameter-sever cpu mode. pserver arguments:{}".format( - has_ps_args)) - launch_ps(args) + if len(has_ps_args) > 0: + logger.info( + "Run parameter-sever mode. pserver arguments:{}, cuda count:{}". + format(has_ps_args, cuda_device_num)) + has_ps_heter_args = list(set(has_ps_args) & set(ps_heter_args)) + if len(has_ps_heter_args) > 0: + return DistributeMode.PS_HETER + else: + return DistributeMode.PS elif len(has_collective_args) > 0: logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}". format(has_collective_args, cuda_device_num)) - launch_collective(args) + return DistributeMode.COLLECTIVE else: logger.warning( "Not found distinct arguments. Default use gpu collective mode") + return DistributeMode.COLLECTIVE + + +def launch(): + args = _parse_args() + logger = get_logger() + _print_arguments(args) + + distribute_mode = which_distributed_mode(args) + if distribute_mode == DistributeMode.COLLECTIVE: launch_collective(args) + else: + launch_ps(args, distribute_mode) if __name__ == "__main__": diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 7540cd9f4c1f3..35782e0b04c5a 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -21,13 +21,27 @@ import copy import sys import subprocess +import tempfile +import shutil from contextlib import closing import socket +import warnings +import paddle +import paddle.fluid as fluid logger = logging.getLogger("root") logger.propagate = False +class DistributeMode: + """ + There are various mode for fleetrun, each of them is designed for different model. 
+ """ + COLLECTIVE = 0 + PS = 1 + PS_HETER = 2 + + class Cluster(object): def __init__(self, hdfs): self.job_server = None @@ -144,14 +158,16 @@ def __init__(self): self.trainers = [] self.servers = [] self.workers = [] + self.heter_workers = [] self.gpus = [] def __str__(self): return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{} servers:{} \ - workers:{}".format(self.rank, self.id, self.addr, self.port, - self.gpus, [str(t) for t in self.trainers], - [str(s) for s in self.servers], - [str(w) for w in self.workers]) + workers:{} heter_workers:{}".format( + self.rank, self.id, self.addr, self.port, self.gpus, [ + str(t) for t in self.trainers + ], [str(s) for s in self.servers], [str(w) for w in self.workers], + [str(h) for h in self.heter_workers]) def __eq__(self, pod): if self.rank != pod.rank or \ @@ -262,7 +278,7 @@ def terminate_local_procs(procs): p.log_fn.close() logger.debug("terminate process id:{}".format(p.proc.pid)) - #wait all process terminiated + # wait all process terminiated time.sleep(3) for step in range(0, 50): alive = False @@ -406,10 +422,10 @@ def start_local_trainers(cluster, else: current_env = copy.copy(envs) - #paddle broadcast ncclUniqueId use socket, and - #proxy maybe make trainers unreachable, so delete them. - #if we set them to "", grpc will log error message "bad uri" - #so just delete them. + # paddle broadcast ncclUniqueId use socket, and + # proxy maybe make trainers unreachable, so delete them. + # if we set them to "", grpc will log error message "bad uri" + # so just delete them. 
current_env.pop("http_proxy", None) current_env.pop("https_proxy", None) @@ -518,3 +534,524 @@ def watch_local_trainers(procs, nranks): raise return alive + + +def get_gpus(gpus): + if gpus is None: + gpus_num = fluid.core.get_cuda_device_count() + res_gpus = [str(x) for x in range(0, gpus_num)] + else: + cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_visible_devices is None or cuda_visible_devices == "": + res_gpus = [x.strip() for x in gpus.split(',')] + else: + # change gpus into relative values + # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.gpus=4,5,6,7; + # therefore gpus=0,1,2,3 + cuda_visible_devices_list = cuda_visible_devices.split(',') + for x in gpus.split(','): + assert x in cuda_visible_devices_list, "Can't find "\ + "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\ + % (x, cuda_visible_devices) + res_gpus = [ + cuda_visible_devices_list.index(x.strip()) + for x in gpus.split(',') + ] + logger.info("Change selected_gpus into reletive values. --ips:{} " + "will change into relative_ips:{} according to your " + "CUDA_VISIBLE_DEVICES:{}".format( + gpus, res_gpus, cuda_visible_devices_list)) + + return res_gpus + + +def direct_start(args): + # run ps-cpu mode on paddlecloud, using given envs + cmd = [sys.executable, "-u", args.training_script] + \ + args.training_script_args + proc = subprocess.Popen(cmd) + proc.wait() + return + + +def get_custom_endpoints(origin_endpoints, offset=0): + """ + origin_endpoint: ip:port + user_define_endpoint: ip:(port+offset) + """ + assert origin_endpoints != None + paddle_user_define_endpoints_list = [] + for ip_port in origin_endpoints.split(","): + ip = ip_port.split(":")[0] + port = ip_port.split(":")[1] + new_port = int(port) + offset + paddle_user_define_endpoints_list.append(":".join((ip, str(new_port)))) + paddle_user_define_endpoints = ",".join(paddle_user_define_endpoints_list) + return paddle_user_define_endpoints + + +def cloud_ps_heter_env_set(args): + environs = {} + + paddle_trainer_endpoints = 
os.getenv("TRAINER_IP_PORT_LIST", "") + assert paddle_trainer_endpoints != None + + paddle_pserver_endpoints = os.getenv("PSERVER_IP_PORT_LIST", "") + assert paddle_pserver_endpoints != None + + # hard code for paddlecloud custom-framework + avilable_ports = os.getenv("TRAINER_PORTS", "").split(",") + assert len( + avilable_ports + ) > 3, "set paddle_ports_num >= 2 in config.ini for paddlecloud job submit" + + # hard code for paddlecloud custom-framework + trainers_num = len(paddle_pserver_endpoints.split(",")) + assert trainers_num != 0 + environs["PADDLE_TRAINERS_NUM"] = trainers_num + environs["TRAINERS_NUM"] = trainers_num + + # hard code for paddlecloud custom-framework + environs["PADDLE_HETER_TRAINER_IP_PORT_LIST"] = paddle_trainer_endpoints + environs["PADDLE_PSERVERS_IP_PORT_LIST"] = paddle_pserver_endpoints + environs["PADDLE_TRAINER_ENDPOINTS"] = get_custom_endpoints( + paddle_pserver_endpoints, 1) + heter_worker_num = len(paddle_trainer_endpoints.split(",")) + if (args.heter_worker_num != None) and ( + heter_worker_num != args.heter_worker_num): + warnings.warn( + "Your fleetrun setting: heter_worker_num is {}, but we find {} device can be used, this setting has been changed.". 
+ format(args.heter_worker_num, heter_worker_num)) + args.heter_worker_num = heter_worker_num + + for k, v in environs.items(): + os.environ[k] = str(v) + logger.info("Set heter parameter server env: {}".format( + pretty_print_envs(environs))) + + +class ParameterServerLauncher(object): + def __init__(self, args, distribute_mode): + self.args = args + self.distribute_mode = distribute_mode + self.server_num = 0 + self.worker_num = 0 + self.heter_worker_num = 0 + + self.server_endpoints = "" + self.server_endpoints_ips = [] + self.server_endpoints_port = [] + + self.worker_endpoints = "" + self.worker_endpoints_ips = [] + self.worker_endpoints_port = [] + + self.heter_worker_endpoints = "" + self.heter_worker_endpoints_ips = [] + self.heter_worker_endpoints_port = [] + + self.is_local = True + self.current_node_ip = "" + + self.get_role_endpoints(args) + + def get_role_endpoints(self, args): + # get server envs + if args.server_num: + self.server_num = args.server_num + if args.servers: + assert len( + args.servers.split(",") + ) == self.server_num, "The server_num and servers doesn't match. Expect servers endpoints num epual to server_num, but received servers enpoint num: {} and server_num {}".format( + len(args.servers.split(",")), self.server_num) + self.server_endpoints = args.servers + else: + ports = get_ports(self.server_num, 0) + self.server_endpoints = ",".join( + ["127.0.0.1:" + str(x) for x in ports]) + else: + assert args.servers != "", "The setting of Parameter-Server must has server_num or servers." + self.server_endpoints = args.servers + self.server_num = len(self.server_endpoints.split(",")) + + # get worker envs + if args.worker_num: + self.worker_num = args.worker_num + if args.workers: + assert len( + args.workers.split(",") + ) == self.worker_num, "The worker_num and workers doesn't match. 
Expect workers endpoints num epual to worker_num, but received workers enpoint num: {} and worker_num {}".format( + len(args.workers.split(",")), self.worker_num) + + self.worker_endpoints = args.workers + else: + ports = get_ports(self.worker_num, self.server_num) + self.worker_endpoints = ",".join( + ["127.0.0.1:" + str(x) for x in ports]) + else: + assert args.workers != "", "The setting of Parameter-Server must has worker_num or workers." + worker_endpoints_ips = [ + x.strip().split(":")[0] for x in args.workers.split(",") + ] + self.worker_num = len(worker_endpoints_ips) + worker_endpoints_len = [ + len(x.strip().split(":")) for x in args.workers.split(",") + ] + + if 1 in worker_endpoints_len: + # if no port value in worker_endpoints, will set default port values. + start_port = 6170 + worker_endpoints_port = range( + start_port + self.server_num, + start_port + self.server_num + self.worker_num, 1) + # create endpoints str + worker_endpoints = [] + for i in range(self.worker_num): + worker_endpoints.append(":".join((worker_endpoints_ips[ + i], str(worker_endpoints_port[i])))) + self.worker_endpoints = ",".join(worker_endpoints) + else: + self.worker_endpoints = args.workers + + # get heter worker envs + if self.distribute_mode == DistributeMode.PS_HETER: + if args.heter_worker_num: + self.heter_worker_num = args.heter_worker_num + if args.heter_workers: + assert len( + args.heter_workers.split(",") + ) == self.heter_worker_num, "The heter_worker_num and heter_workers doesn't match. 
Expect heter_workers endpoints num epual to heter_worker_num, but received heter_workers enpoint num: {} and heter_worker_num {}".format( + len(args.heter_workers.split(",")), + self.heter_worker_num) + self.heter_worker_endpoints = args.heter_workers + else: + ports = get_ports(self.heter_worker_num, + self.server_num + self.worker_num) + self.heter_worker_endpoints = ",".join( + ["127.0.0.1:" + str(x) for x in ports]) + else: + assert args.heter_workers != "", "The setting of Parameter-Server heter mode must has heter_worker_num or heter_workers." + self.heter_worker_endpoints = args.heter_workers + self.heter_worker_num = len( + self.heter_worker_endpoints.split(",")) + + # check local or user define + self.server_endpoints_ips = [ + x.strip().split(":")[0] for x in self.server_endpoints.split(",") + ] + self.worker_endpoints_ips = [ + x.strip().split(":")[0] for x in self.worker_endpoints.split(",") + ] + self.server_endpoints_port = [ + x.strip().split(":")[1] for x in self.server_endpoints.split(",") + ] + self.worker_endpoints_port = [ + x.strip().split(":")[1] for x in self.worker_endpoints.split(",") + ] + self.node_ips = list( + set(self.server_endpoints_ips + self.worker_endpoints_ips)) + if self.distribute_mode == DistributeMode.PS_HETER: + self.heter_worker_endpoints_ips = [ + x.strip().split(":")[0] + for x in self.heter_worker_endpoints.split(",") + ] + self.heter_worker_endpoints_port = [ + x.strip().split(":")[1] + for x in self.heter_worker_endpoints.split(",") + ] + self.node_ips = list( + set(self.node_ips + self.heter_worker_endpoints_ips)) + + if len(set(self.node_ips)) == 1: + self.is_local = True + self.current_node_ip = self.node_ips[0] + else: + self.is_local = False + pod_ip = os.getenv("POD_IP", None) + if pod_ip == None: + _, self.current_node_ip = get_host_name_ip() + else: + self.current_node_ip = pod_ip + assert self.current_node_ip in self.node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ + % 
(self.current_node_ip, self.node_ips) + self.node_rank = self.node_ips.index(self.current_node_ip) + + logger.debug( + "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}". + format(self.node_ips, self.current_node_ip, self.node_rank)) + + def start_ps(self): + cluster = Cluster(hdfs=None) + server_rank = 0 + worker_rank = 0 + heter_worker_rank = 0 + + for node_rank, ip in enumerate(self.node_ips): + pod = Pod() + pod.rank = node_rank + pod.addr = ip + for i in range(len(self.server_endpoints_ips)): + if ip == self.server_endpoints_ips[i]: + server = Trainer() + server.endpoint = "%s:%s" % (ip, + self.server_endpoints_port[i]) + server.rank = server_rank + server_rank += 1 + pod.servers.append(server) + for j in range(len(self.worker_endpoints_ips)): + if ip == self.worker_endpoints_ips[j]: + worker = Trainer() + worker.endpoint = "%s:%s" % (ip, + self.worker_endpoints_port[j]) + worker.rank = worker_rank + worker_rank += 1 + pod.workers.append(worker) + for k in range(len(self.heter_worker_endpoints_ips)): + if ip == self.heter_worker_endpoints_ips[k]: + heter_worker = Trainer() + heter_worker.endpoint = "%s:%s" % ( + ip, self.heter_worker_endpoints_port[k]) + heter_worker.rank = heter_worker_rank + heter_worker_rank += 1 + pod.heter_workers.append(heter_worker) + + cluster.pods.append(pod) + + pod = cluster.pods[self.node_rank] + self.gloo_rendezvous_dir = tempfile.mkdtemp() + + # 3. subproces start + self.procs = {"worker": [], "server": [], "heter_worker": []} + self.cmds = {"worker": [], "server": [], "heter_worker": []} + self.log_fns = {"worker": [], "server": [], "heter_worker": []} + + self.start_pod_server(self.args, pod) + self.start_pod_worker(self.args, pod) + self.start_pod_heter_worker(self.args, pod) + + logger.info( + "Please check servers, workers and heter_worker logs in {}/workerlog.*, {}/serverlog.* and {}/heterlog.*". + format(self.args.log_dir, self.args.log_dir, self.args.log_dir)) + + # 4. 
wait for finish training + if len(self.procs["worker"]) > 0: + # if node has worker procs + # only wait worker to finish here + for i, proc in enumerate(self.procs["worker"]): + self.procs["worker"][i].proc.wait() + if len(self.log_fns["worker"]) > 0: + self.log_fns["worker"][i].close() + logger.info( + "all workers exit, going to finish parameter server and heter_worker." + ) + if len(self.procs["heter_worker"]) > 0: + for i, proc in enumerate(self.procs["heter_worker"]): + self.log_fns["heter_worker"][i].close() + self.procs["heter_worker"][i].proc.terminate() + logger.info("all heter_worker are killed") + + if len(self.procs["server"]) > 0: + for i, proc in enumerate(self.procs["server"]): + self.log_fns["server"][i].close() + self.procs["server"][i].proc.terminate() + logger.info("all parameter server are killed") + + else: + # if node has not worker procs + # blocking training process + if len(self.procs["server"]) > 0: + for i, proc in enumerate(self.procs["server"]): + self.procs["server"][i].proc.wait() + + if len(self.procs["heter_worker"]) > 0: + for i, proc in enumerate(self.procs["heter_worker"]): + self.procs["heter_worker"][i].proc.wait() + + if os.path.exists(self.gloo_rendezvous_dir): + shutil.rmtree(self.gloo_rendezvous_dir) + + def start_pod_server(self, args, pod): + default_env = os.environ.copy() + current_env = copy.copy(default_env) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + for idx, cur_server in enumerate(pod.servers): + proc_env = { + "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints, + "PADDLE_HETER_TRAINER_IP_PORT_LIST": + self.heter_worker_endpoints, + "PADDLE_PORT": cur_server.endpoint.split(":")[1], + "TRAINING_ROLE": "PSERVER", + "PADDLE_TRAINERS_NUM": str(self.worker_num), + "POD_IP": cur_server.endpoint.split(":")[0], + "PADDLE_WITH_GLOO": "1", + "PADDLE_GLOO_RENDEZVOUS": "2", + "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir + } + 
current_env.update(proc_env) + + cmd = [sys.executable, "-u", args.training_script + ] + args.training_script_args + self.cmds["server"].append(cmd) + + if idx == 0: + logger.info( + "Local server start {} processes. First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.servers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value" + )))) + + if args.log_dir is not None: + os.system("mkdir -p {}".format(args.log_dir)) + fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w") + self.log_fns["server"].append(fn) + proc = subprocess.Popen( + cmd, env=current_env, stdout=fn, stderr=fn) + else: + proc = subprocess.Popen(cmd, env=current_env) + + tp = TrainerProc() + tp.proc = proc + tp.rank = cur_server.rank + tp.local_rank = idx + tp.log_fn = fn + tp.log_offset = fn.tell() if fn else None + tp.cmd = cmd + + self.procs["server"].append(tp) + + def start_pod_worker(self, args, pod): + default_env = os.environ.copy() + current_env = copy.copy(default_env) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + heter_device_num = 0 + device_list = [] + if fluid.core.is_compiled_with_cuda(): + device_list = get_gpus(args.gpus) + heter_device_num = len(device_list) + elif fluid.core.is_compiled_with_xpu(): + heter_device_num = fluid.core.get_xpu_device_count() + device_list = [str(x) for x in range(0, heter_device_num)] + + for idx, cur_worker in enumerate(pod.workers): + device_id = str(device_list[idx % heter_device_num]) + proc_env = { + "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints, + "PADDLE_TRAINERS_NUM": str(self.worker_num), + "PADDLE_HETER_TRAINER_IP_PORT_LIST": + self.heter_worker_endpoints, + "TRAINING_ROLE": "TRAINER", + "PADDLE_TRAINER_ID": str(cur_worker.rank), + "PADDLE_WITH_GLOO": "1", + "PADDLE_GLOO_RENDEZVOUS": "2", + "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, + "FLAGS_selected_gpus": "0", + "FLAGS_selected_xpus": "0", + 
"CUDA_VISIBLE_DEVICES": device_id, + "XPU_VISIBLE_DEVICES": device_id, + } + current_env.update(proc_env) + + cmd = [sys.executable, "-u", args.training_script + ] + args.training_script_args + self.cmds["worker"].append(cmd) + + if idx == 0: + logger.info( + "Local worker start {} processes. First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.workers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value" + )))) + + if args.log_dir is not None: + os.system("mkdir -p {}".format(args.log_dir)) + fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w") + self.log_fns["worker"].append(fn) + proc = subprocess.Popen( + cmd, env=current_env, stdout=fn, stderr=fn) + else: + proc = subprocess.Popen(cmd, env=current_env) + + tp = TrainerProc() + tp.proc = proc + tp.rank = cur_worker.rank + tp.local_rank = idx + tp.log_fn = fn + tp.log_offset = fn.tell() if fn else None + tp.cmd = cmd + + self.procs["worker"].append(tp) + + def start_pod_heter_worker(self, args, pod): + default_env = os.environ.copy() + current_env = copy.copy(default_env) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + heter_device_num = 0 + device_list = [] + if fluid.core.is_compiled_with_cuda(): + device_list = get_gpus(args.gpus) + heter_device_num = len(device_list) + elif fluid.core.is_compiled_with_xpu(): + heter_device_num = fluid.core.get_xpu_device_count() + device_list = [str(x) for x in range(0, heter_device_num)] + assert heter_device_num != 0 + + for idx, cur_heter_worker in enumerate(pod.heter_workers): + device_id = str(device_list[idx % heter_device_num]) + proc_env = { + "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints, + "PADDLE_HETER_TRAINER_IP_PORT_LIST": + self.heter_worker_endpoints, + "PADDLE_PORT": cur_heter_worker.endpoint.split(":")[1], + "TRAINING_ROLE": "HETER_TRAINER", + "PADDLE_TRAINERS_NUM": str(self.worker_num), + "POD_IP": 
cur_heter_worker.endpoint.split(":")[0], + "PADDLE_WITH_GLOO": "1", + "PADDLE_GLOO_RENDEZVOUS": "2", + "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, + "FLAGS_selected_gpus": "0", + "FLAGS_selected_xpus": "0", + "CUDA_VISIBLE_DEVICES": device_id, + "XPU_VISIBLE_DEVICES": device_id, + } + current_env.update(proc_env) + + cmd = [sys.executable, "-u", args.training_script + ] + args.training_script_args + self.cmds["heter_worker"].append(cmd) + + if idx == 0: + logger.info( + "Local heter_worker start {} processes. First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.heter_workers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value" + )))) + + if args.log_dir is not None: + os.system("mkdir -p {}".format(args.log_dir)) + fn = open("%s/heterlog.%d" % (args.log_dir, idx), "w") + self.log_fns["heter_worker"].append(fn) + proc = subprocess.Popen( + cmd, env=current_env, stdout=fn, stderr=fn) + else: + proc = subprocess.Popen(cmd, env=current_env) + + tp = TrainerProc() + tp.proc = proc + tp.rank = cur_heter_worker.rank + tp.local_rank = idx + tp.log_fn = fn + tp.log_offset = fn.tell() if fn else None + tp.cmd = cmd + + self.procs["heter_worker"].append(tp) diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index ad96e1426694f..283589c5f3320 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -19,16 +19,14 @@ class AMPOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(AMPOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - self.amp_opt = None + self.wrapped_opt = None # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer", - "LocalSGDOptimizer", "GradientMergeOptimizer", "GraphExecutionOptimizer", - 
"AdaptiveLocalSGDOptimizer", ] self.meta_optimizers_black_list = ["DGCOptimizer"] @@ -37,6 +35,24 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer, super(AMPOptimizer, self)._set_basic_info( loss, role_maker, user_defined_optimizer, user_defined_strategy) + def _init_wrapped_opt(self): + if self.wrapped_opt is not None: + return + + config = self.user_defined_strategy.amp_configs + + custom_white_list = set(config['custom_white_list']) + custom_black_list = set(config['custom_black_list']) + custom_black_varnames = set(config['custom_black_varnames']) + amp_lists = mixed_precision.AutoMixedPrecisionLists( + custom_white_list, custom_black_list, custom_black_varnames) + + self.wrapped_opt = mixed_precision.decorate( + self.inner_opt, amp_lists, config['init_loss_scaling'], + config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'], + config['incr_ratio'], config['decr_ratio'], + config['use_dynamic_loss_scaling']) + def _can_apply(self): if not self.role_maker._is_collective: return False @@ -60,26 +76,31 @@ def _enable_strategy(self, dist_strategy, context): "use_dynamic_loss_scaling": True } + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + # maybe inner_opt of other meta optimizer + self._init_wrapped_opt() + return self.wrapped_opt.backward(loss, startup_program, parameter_list, + no_grad_set, callbacks) + + def apply_gradients(self, params_grads): + return self.wrapped_opt.apply_gradients(params_grads=params_grads) + + def apply_optimize(self, loss, startup_program, params_grads): + return self.wrapped_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): - if self.amp_opt is None: - config = self.user_defined_strategy.amp_configs - custom_white_list = set(config['custom_white_list']) - custom_black_list = set(config['custom_black_list']) - 
custom_black_varnames = set(config['custom_black_varnames']) - amp_lists = mixed_precision.AutoMixedPrecisionLists( - custom_white_list, custom_black_list, custom_black_varnames) - - self.amp_opt = mixed_precision.decorate( - self.inner_opt, amp_lists, config['init_loss_scaling'], - config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'], - config['incr_ratio'], config['decr_ratio'], - config['use_dynamic_loss_scaling']) - + self._init_wrapped_opt() optimize_ops, params_grads = \ - self.amp_opt.minimize(loss, startup_program, + self.wrapped_opt.minimize(loss, startup_program, parameter_list, no_grad_set) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index 6806a479d30f4..9990021c8506a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -85,6 +85,13 @@ def backward(self, return self.dgc_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + def apply_gradients(self, params_grads): + return self.dgc_opt.apply_gradients(params_grads=params_grads) + + def apply_optimize(self, loss, startup_program, params_grads): + return self.dgc_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index df9887759e16f..64d54ae3bab03 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -98,6 +98,10 @@ def backward(self, def apply_gradients(self, params_grads): return self.lamb_opt.apply_gradients(params_grads=params_grads) + def apply_optimize(self, loss, startup_program, params_grads): + return 
self.lamb_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index 609d8b85e714c..32c6be505a546 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -85,6 +85,10 @@ def backward(self, def apply_gradients(self, params_grads): return self.lars_opt.apply_gradients(params_grads=params_grads) + def apply_optimize(self, loss, startup_program, params_grads): + return self.lars_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index 9f094978d842a..91030f0762934 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -24,7 +24,7 @@ class LocalSGDOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(LocalSGDOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - self.meta_optimizers_white_list = [] + self.meta_optimizers_white_list = ['AMPOptimizer'] self.meta_optimizers_black_list = [ "GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer", @@ -195,7 +195,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(AdaptiveLocalSGDOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - self.meta_optimizers_white_list = [] + self.meta_optimizers_white_list = ['AMPOptimizer'] self.meta_optimizers_black_list = [ "GraphExecutionOptimizer", "LocalSGDOptimizer" ] diff --git 
a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index 38ad41f8836b4..83345cb6f623e 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -74,6 +74,8 @@ def _build_trainer_programs(self, compiled_config): _startup = worker.delet_extra_optimizes_pass(_startup, compiled_config) + compiled_config.set_origin_ps_main_program(_main) + compiled_config.set_origin_ps_startup_program(_startup) # for heter program if self.role_maker._is_heter_parameter_server_mode: from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker @@ -91,6 +93,8 @@ def _build_trainer_programs(self, compiled_config): else: _main = worker.append_send_ops_pass(_main, compiled_config) _startup = _startup + compiled_config.set_origin_ps_main_program(_main) + compiled_config.set_origin_ps_startup_program(_startup) return _main, _startup diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py index 59ca7e633099e..ea2b67ac4bd1f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py @@ -18,15 +18,14 @@ class RecomputeOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(RecomputeOptimizer, self).__init__(optimizer) - #self.inner_opt = RO(optimizer) self.inner_opt = optimizer - self.wrapped_opt = RO(optimizer) + self.wrapped_opt = None # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ "LarsOptimizer", "LambOptimizer", - "GradientMergeOptimizer", "GraphExecutionOptimizer", + "DGCOptimizer", ] self.meta_optimizers_black_list = [] @@ -34,8 +33,15 @@ def _set_basic_info(self, loss, 
role_maker, user_defined_optimizer, user_defined_strategy): super(RecomputeOptimizer, self)._set_basic_info( loss, role_maker, user_defined_optimizer, user_defined_strategy) - self.wrapped_opt._set_checkpoints( - list(user_defined_strategy.recompute_configs["checkpoints"])) + + def _init_wrapped_opt(self): + if self.wrapped_opt is not None: + return + + configs = self.user_defined_strategy.recompute_configs + + self.wrapped_opt = RO(self.inner_opt) + self.wrapped_opt._set_checkpoints(list(configs["checkpoints"])) def _can_apply(self): if not self.role_maker._is_collective: @@ -62,14 +68,24 @@ def backward(self, parameter_list=None, no_grad_set=None, callbacks=None): + # maybe inner_opt of other meta optimizer + self._init_wrapped_opt() return self.wrapped_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + def apply_gradients(self, params_grads): + return self.wrapped_opt.apply_gradients(params_grads=params_grads) + + def apply_optimize(self, loss, startup_program, params_grads): + return self.wrapped_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): + self._init_wrapped_opt() optimize_ops, params_grads = \ self.wrapped_opt.minimize(loss, startup_program, parameter_list, no_grad_set) diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 42be7e869d9a7..266c7d0f405bf 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -210,18 +210,23 @@ def get_sparse_attrs(): warnings.warn("communicator has been initialized, skip") def _get_executor(self): - if self.role_maker._is_heter_worker(): - if self.role_maker._get_heter_worker_device() == "GPU": - gpu_id = int(os.getenv("FLAGS_selected_gpus", "0")) - executor = 
Executor(fluid.CUDAPlace(gpu_id)) - elif self.role_maker._get_heter_worker_device() == "XPU": - xpu_id = int(os.getenv("FLAGS_selected_xpus", "0")) - executor = Executor(fluid.XPUPlace(xpu_id)) - else: - raise ValueError("Not Support Device {}".format( - self.role_maker._get_heter_worker_device())) - else: - executor = fluid.Executor(fluid.CPUPlace()) + executor = fluid.Executor(fluid.CPUPlace()) + if self.role_maker._is_heter_parameter_server_mode: + heter_worker_device_guard = self.context[ + "valid_strategy"].a_sync_configs[ + "heter_worker_device_guard"].upper() + if heter_worker_device_guard not in ["GPU", "XPU", "CPU"]: + raise ValueError("Heter Worker Not Support Device {}".format( + heter_worker_device_guard)) + if self.role_maker._is_heter_worker(): + if heter_worker_device_guard == "GPU": + executor = Executor( + fluid.CUDAPlace( + int(os.getenv("FLAGS_selected_gpus", "0")))) + elif heter_worker_device_guard == "XPU": + executor = Executor( + fluid.XPUPlace( + int(os.getenv("FLAGS_selected_xpus", "0")))) return executor def _init_server(self, *args, **kwargs): @@ -233,12 +238,14 @@ def _init_server(self, *args, **kwargs): model_dirname = None executor = self._get_executor() + if self.role_maker._is_heter_worker() and self.context[ + "valid_strategy"].a_sync_configs["launch_barrier"]: + # for heter trainer wait server ready + wait_server_ready(self.role_maker._get_pserver_endpoints()) executor.run(fluid.default_startup_program()) if self.role_maker._is_heter_worker(): self._init_worker() - - if self.role_maker._is_heter_worker(): return if not model_dirname: @@ -470,13 +477,13 @@ def _save_distributed_params(self, executor, dirname, context, def _save_distributed_persistables(self, executor, dirname, main_program): dense_ctx = self.compiled_strategy.get_communicator_recv_context( - recv_type=1) + recv_type=1, use_origin_program=True) sparse_ctx = self.compiled_strategy.get_communicator_recv_context( - recv_type=2) + recv_type=2, use_origin_program=True) 
distributed_ctx = self.compiled_strategy.get_communicator_recv_context( - recv_type=3) + recv_type=3, use_origin_program=True) recv_dense_varnames = self._save_dense_params(executor, dirname, dense_ctx, main_program) @@ -528,7 +535,7 @@ def _ps_inference_save_persistables(self, ) if main_program is None: - main_program = fluid.default_main_program() + main_program = self.compiled_strategy.get_origin_ps_main_program() if isinstance(main_program, CompiledProgram): raise TypeError( diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index 35204affb3fd1..ff3e882229ae8 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -28,13 +28,14 @@ from .fluid import core from .fluid.framework import in_dygraph_mode from .tensor.math import elementwise_mul, elementwise_div, elementwise_add, elementwise_sub +from .tensor import arange, gather_nd, concat, multinomial import math import numpy as np import warnings from .fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype -__all__ = ['Distribution', 'Uniform', 'Normal'] +__all__ = ['Distribution', 'Uniform', 'Normal', 'Categorical'] class Distribution(object): @@ -640,3 +641,318 @@ def kl_divergence(self, other): t1 = (t1 * t1) return elementwise_add( 0.5 * var_ratio, 0.5 * (t1 - 1. - nn.log(var_ratio)), name=name) + + +class Categorical(Distribution): + """ + Categorical distribution is a discrete probability distribution that + describes the possible results of a random variable that can take on + one of K possible categories, with the probability of each category + separately specified. + + The probability mass function (pmf) is: + + .. math:: + + pmf(k; p_i) = \prod_{i=1}^{k} p_i^{[x=i]} + + In the above equation: + + * :math:`[x=i]` : it evaluates to 1 if :math:`x==i` , 0 otherwise. + + Args: + logits(list|numpy.ndarray|Tensor): The logits input of categorical distribution. The data type is float32 or float64. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + y = paddle.rand([6]) + print(y.numpy()) + # [0.6365463 , 0.7278677 , 0.90260243, + # 0.5226815 , 0.35837543, 0.13981032] + + cat = Categorical(x) + cat2 = Categorical(y) + + cat.sample([2,3]) + # [[5, 1, 1], + # [0, 1, 2]] + + cat.entropy() + # [1.71887] + + cat.kl_divergence(cat2) + # [0.0278455] + + value = paddle.to_tensor([2,1,3]) + cat.probs(value) + # [0.341613 0.342648 0.03123] + + cat.log_prob(value) + # [-1.07408 -1.07105 -3.46638] + + """ + + def __init__(self, logits, name=None): + """ + Args: + logits(list|numpy.ndarray|Variable): The logits input of categorical distribution. The data type is float32 or float64. + """ + if not in_dygraph_mode(): + check_type(logits, 'logits', (np.ndarray, tensor.Variable, list), + 'Categorical') + + self.name = name if name is not None else 'Categorical' + self.dtype = 'float32' + + if self._validate_args(logits): + self.logits = logits + self.dtype = convert_dtype(logits.dtype) + else: + if isinstance(logits, np.ndarray) and str( + logits.dtype) in ['float32', 'float64']: + self.dtype = logits.dtype + self.logits = self._to_tensor(logits)[0] + if self.dtype != convert_dtype(self.logits.dtype): + self.logits = tensor.cast(self.logits, dtype=self.dtype) + + def sample(self, shape): + """Generate samples of the specified shape. + + Args: + shape (list): Shape of the generated samples. + + Returns: + Tensor: A tensor with prepended dimensions shape. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + + cat = Categorical(x) + + cat.sample([2,3]) + # [[5, 1, 1], + # [0, 1, 2]] + + """ + name = self.name + '_sample' + if not in_dygraph_mode(): + check_type(shape, 'shape', (list), 'sample') + + num_samples = np.prod(np.array(shape)) + + logits_shape = list(self.logits.shape) + if len(logits_shape) > 1: + sample_shape = shape + logits_shape[:-1] + logits = nn.reshape(self.logits, + [np.prod(logits_shape[:-1]), logits_shape[-1]]) + else: + sample_shape = shape + logits = self.logits + + sample_index = multinomial(logits, num_samples, True) + return nn.reshape(sample_index, sample_shape, name=name) + + def kl_divergence(self, other): + """The KL-divergence between two Categorical distributions. + + Args: + other (Categorical): instance of Categorical. The data type is float32. + + Returns: + Variable: kl-divergence between two Categorical distributions. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + y = paddle.rand([6]) + print(y.numpy()) + # [0.6365463 , 0.7278677 , 0.90260243, + # 0.5226815 , 0.35837543, 0.13981032] + + cat = Categorical(x) + cat2 = Categorical(y) + + cat.kl_divergence(cat2) + # [0.0278455] + + """ + name = self.name + '_kl_divergence' + if not in_dygraph_mode(): + check_type(other, 'other', Categorical, 'kl_divergence') + + logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) + other_logits = other.logits - nn.reduce_max( + other.logits, dim=-1, keep_dim=True) + e_logits = ops.exp(logits) + other_e_logits = ops.exp(other_logits) + z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) + other_z = nn.reduce_sum(other_e_logits, dim=-1, keep_dim=True) + prob = e_logits / z + kl = nn.reduce_sum( + prob * (logits - nn.log(z) - other_logits + nn.log(other_z)), + dim=-1, + keep_dim=True, + name=name) + + return kl + + def entropy(self): + """Shannon entropy in nats. + + Returns: + Variable: Shannon entropy of Categorical distribution. The data type is float32. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + + cat = Categorical(x) + + cat.entropy() + # [1.71887] + + """ + name = self.name + '_entropy' + logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) + e_logits = ops.exp(logits) + z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) + prob = e_logits / z + + neg_entropy = nn.reduce_sum( + prob * (logits - nn.log(z)), dim=-1, keep_dim=True) + entropy = nn.scale(neg_entropy, scale=-1.0, name=name) + return entropy + + def probs(self, value): + """Probabilities of the given category (``value``). 
+ + If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as + category, and the others represents the different distributions. + At the same time, if ``vlaue`` is 1-D Tensor, ``value`` will be broadcast to the + same number of distributions as ``logits``. + If ``value`` is not 1-D Tensor, ``value`` should have the same number distributions + with ``logits. That is, ``value[:-1] = logits[:-1]``. + + Args: + value (Tensor): The input tensor represents the selected category index. + + Returns: + Tensor: probability according to the category index. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + + cat = Categorical(x) + + value = paddle.to_tensor([2,1,3]) + cat.probs(value) + # [0.341613 0.342648 0.03123] + + """ + name = self.name + '_probs' + + dist_sum = nn.reduce_sum(self.logits, dim=-1, keep_dim=True) + prob = self.logits / dist_sum + + shape = list(prob.shape) + value_shape = list(value.shape) + if len(shape) == 1: + num_value_in_one_dist = np.prod(value_shape) + index_value = nn.reshape(value, [num_value_in_one_dist, 1]) + index = index_value + else: + num_dist = np.prod(shape[:-1]) + num_value_in_one_dist = value_shape[-1] + prob = nn.reshape(prob, [num_dist, shape[-1]]) + if len(value_shape) == 1: + value = nn.expand(value, [num_dist]) + value_shape = shape[:-1] + value_shape + index_value = nn.reshape(value, [num_dist, -1, 1]) + if shape[:-1] != value_shape[:-1]: + raise ValueError( + "shape of value {} must match shape of logits {}".format( + str(value_shape[:-1]), str(shape[:-1]))) + + index_prefix = nn.unsqueeze( + arange( + num_dist, dtype=index_value.dtype), axes=-1) + index_prefix = nn.expand(index_prefix, [1, num_value_in_one_dist]) + index_prefix = nn.unsqueeze(index_prefix, axes=-1) + + if index_value.dtype != index_prefix.dtype: + 
tensor.cast(index_prefix, dtype=index_value.dtype) + index = concat([index_prefix, index_value], axis=-1) + + # value is the category index to search for the corresponding probability. + select_prob = gather_nd(prob, index) + return nn.reshape(select_prob, value_shape, name=name) + + def log_prob(self, value): + """Log probabilities of the given category. Refer to ``probs`` method. + + Args: + value (Tensor): The input tensor represents the selected category index. + + Returns: + Tensor: Log probability. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + + cat = Categorical(x) + + value = paddle.to_tensor([2,1,3]) + + cat.log_prob(value) + # [-1.07408 -1.07105 -3.46638] + + """ + name = self.name + '_log_prob' + + return nn.log(self.probs(value), name=name) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 0e7a9dbea2561..505d6fef8fb53 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -26,8 +26,8 @@ from .dygraph import base as imperative_base __all__ = [ - 'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue', - 'GradientClipByNorm', 'GradientClipByGlobalNorm' + 'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue', + 'ClipGradByNorm', 'ClipGradByGlobalNorm' ] @@ -115,16 +115,9 @@ def error_clip_callback(block, context): error_clip._append_clip_op(block, grad_n) -class GradientClipBase(object): - def __init__(self, need_clip=None): - if need_clip is not None and not callable(need_clip): - raise TypeError( - "The type of need_clip must be funciton, and it can filter out " - "parameter that does't need gradient clip. This function must return " - "True or False, and True means that clipping is required. 
Please refer to " - "API documention of GradientClipByGlobalNorm / GradientClipByNorm " - "/GradientClipByValue.") - self._need_clip_func = need_clip +class ClipGradBase(object): + def __init__(self): + super(ClipGradBase, self).__init__() def __str__(self): raise NotImplementedError() @@ -144,7 +137,7 @@ def __call__(self, params_grads): if getattr(p, 'gradient_clip_attr', None) is not None: warnings.warn( "'set_gradient_clip' will be ineffective, because you have " - "set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' " + "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' " "is redundant and you can remove it.") break return self._static_clip(params_grads) @@ -156,7 +149,7 @@ def _create_operators(self, param, grad): raise NotImplementedError() -class GradientClipByValue(GradientClipBase): +class ClipGradByValue(ClipGradBase): """ Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max]. @@ -164,19 +157,20 @@ class GradientClipByValue(GradientClipBase): - Any values greater than max are set to ``max``. - The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` - is not None, then only part of gradients can be selected for gradient clipping. + The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). + + Note: + ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. Args: max (float): The maximum value to clip by. min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` automatically. 
In this case, ``max`` must be greater than 0. - need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` - (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, - and gradients of all parameters in the network will be clipped. Examples: .. code-block:: python @@ -184,29 +178,20 @@ class GradientClipByValue(GradientClipBase): import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(10, 10) + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) loss.backward() - # clip all parameters in network: - clip = paddle.nn.GradientClipByValue(min=-1, max=1) - - # clip a part of parameters in network: (e.g. linear_0.w_0) - # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool - # def fileter_func(ParamBase): - # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0) - # return ParamBase.name == "linear_0.w_0" - # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter - # return ParamBase.name == linear.weight.name - # clip = paddle.nn.GradientClipByValue(min=-1, max=1, need_clip=fileter_func) - + clip = paddle.nn.ClipGradByValue(min=-1, max=1) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg.step() """ - def __init__(self, max, min=None, need_clip=None): - super(GradientClipByValue, self).__init__(need_clip) + def __init__(self, max, min=None): + super(ClipGradByValue, self).__init__() if min is None: assert (max > 0.0) min = -max @@ -214,7 +199,7 @@ def __init__(self, max, min=None, need_clip=None): self.min = float(min) def __str__(self): - return 
"Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max) + return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max) @imperative_base.no_grad def _dygraph_clip(self, params_grads): @@ -222,7 +207,7 @@ def _dygraph_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue new_grad = layers.clip(x=g, min=self.min, max=self.max) @@ -236,8 +221,7 @@ def _static_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue @@ -256,7 +240,7 @@ def _create_operators(self, param, grad): return param, new_grad -class GradientClipByNorm(GradientClipBase): +class ClipGradByNorm(ClipGradBase): """ Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . @@ -264,8 +248,8 @@ class GradientClipByNorm(GradientClipBase): - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done. - The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` - is not None, then only part of gradients can be selected for gradient clipping. + The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). @@ -287,11 +271,12 @@ class GradientClipByNorm(GradientClipBase): .. 
math:: norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}} + Note: + ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + Args: clip_norm(float): The maximum norm value. - need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` - (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, - and gradients of all parameters in the network will be clipped. Examples: .. code-block:: python @@ -299,29 +284,20 @@ class GradientClipByNorm(GradientClipBase): import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(10, 10) + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) loss.backward() - # clip all parameters in network: - clip = paddle.nn.GradientClipByNorm(clip_norm=1.0) - - # clip a part of parameters in network: (e.g. 
linear_0.w_0) - # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool - # def fileter_func(ParamBase): - # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0) - # return ParamBase.name == "linear_0.w_0" - # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter - # return ParamBase.name == linear.weight.name - # clip = paddle.nn.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func) - + clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg.step() """ - def __init__(self, clip_norm, need_clip=None): - super(GradientClipByNorm, self).__init__(need_clip) + def __init__(self, clip_norm): + super(ClipGradByNorm, self).__init__() self.clip_norm = float(clip_norm) def __str__(self): @@ -333,7 +309,7 @@ def _dygraph_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) @@ -347,8 +323,7 @@ def _static_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue @@ -367,7 +342,7 @@ def _create_operators(self, param, grad): return param, new_grad -class GradientClipByGlobalNorm(GradientClipBase): +class ClipGradByGlobalNorm(ClipGradBase): """ Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in :math:`t\_list` , and limit it to ``clip_norm`` . 
@@ -376,8 +351,8 @@ class GradientClipByGlobalNorm(GradientClipBase): - If the global norm is less than or equal to ``clip_norm`` , nothing will be done. - The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` - is not None, then only part of gradients can be selected for gradient clipping. + The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). @@ -394,12 +369,13 @@ class GradientClipByGlobalNorm(GradientClipBase): global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} + Note: + ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + Args: clip_norm (float): The maximum norm value. - group_name (str, optional): The group name for this clip. Default value is ``default_group`` - need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` - (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, - and gradients of all parameters in the network will be clipped. + group_name (str, optional): The group name for this clip. Default value is ``default_group``. Examples: .. 
code-block:: python @@ -407,29 +383,20 @@ class GradientClipByGlobalNorm(GradientClipBase): import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(10, 10) + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) loss.backward() - # clip all parameters in network: - clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0) - - # clip a part of parameters in network: (e.g. linear_0.w_0) - # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool - # def fileter_func(ParamBase): - # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0) - # return ParamBase.name == "linear_0.w_0" - # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter - # return ParamBase.name == linear.weight.name - # clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func) - + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg.step() """ - def __init__(self, clip_norm, group_name="default_group", need_clip=None): - super(GradientClipByGlobalNorm, self).__init__(need_clip) + def __init__(self, clip_norm, group_name="default_group"): + super(ClipGradByGlobalNorm, self).__init__() self.clip_norm = float(clip_norm) self.group_name = group_name @@ -443,7 +410,7 @@ def _dygraph_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: @@ -469,7 +436,7 @@ def _dygraph_clip(self, params_grads): for p, g 
in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue new_grad = layers.elementwise_mul(x=g, y=clip_var) @@ -484,8 +451,7 @@ def _static_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: continue merge_grad = g with p.block.program._optimized_guard([p, g]): @@ -518,8 +484,7 @@ def _static_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue @@ -670,9 +635,9 @@ def network(): "This method can reduce the mistakes, please " "refer to documention of 'optimizer'.") - if not isinstance(clip, GradientClipBase): + if not isinstance(clip, ClipGradBase): raise TypeError( - "'clip' should be an instance of GradientClipBase's derived class") + "'clip' should be an instance of ClipGradBase's derived class") if program is None: program = framework.default_main_program() @@ -708,7 +673,7 @@ def append_gradient_clip_ops(param_grads): clip_attr = getattr(p, 'gradient_clip_attr', None) if clip_attr is None: return param_grads - if not isinstance(clip_attr, GradientClipBase): + if not isinstance(clip_attr, ClipGradBase): raise TypeError( "clip attribute should be an instance of GradientClipBase") @@ -754,6 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): op._set_attr('op_role_var', correct_p_g) -ClipByValue = GradientClipByValue -ClipByNorm = GradientClipByNorm -ClipByGlobalNorm = GradientClipByGlobalNorm +GradientClipBase = ClipGradBase +GradientClipByValue = ClipGradByValue +GradientClipByNorm = ClipGradByNorm +GradientClipByGlobalNorm = ClipGradByGlobalNorm diff --git 
a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index ac6493b1c2969..d0543bb90dd14 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -1525,10 +1525,10 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): grid = fluid.data(name='grid', shape=[None, 12, 8, 10, 6], dtype='float32') # without offset - output = fluid.layers.bilateral_slice(x, guide, grid, has_offset=False) + output = fluid.contrib.bilateral_slice(x, guide, grid, has_offset=False) # has offset - output = fluid.layers.bilateral_slice(x, guide, grid, has_offset=True) + output = fluid.contrib.bilateral_slice(x, guide, grid, has_offset=True) """ helper = LayerHelper("bilateral_slice", **locals()) @@ -1541,7 +1541,9 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): out = helper.create_variable_for_type_inference(x.dtype) inputs = {'X': x, 'Guide': guide, 'Grid': grid} - + if paddle.fluid.in_dygraph_mode(): + attrs = ('has_offset', has_offset) + return getattr(core.ops, "bilateral_slice")(x, grid, guide, *attrs) helper.append_op( type='bilateral_slice', inputs=inputs, diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index c9112ac849ce0..529c664e7083c 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -16,6 +16,7 @@ from ... import default_startup_program from ... import layers from ... import unique_name +from ... import program_guard from . 
import fp16_utils from .fp16_utils import rewrite_program from .fp16_utils import update_role_var_grad @@ -58,21 +59,40 @@ def __init__(self, optimizer, amp_lists, init_loss_scaling, self._optimizer = optimizer self._amp_lists = amp_lists self._param_grads = None - self._train_program = default_main_program() - self._startup_prog = default_startup_program() + self._train_program = None + self._scaled_loss = None - self._loss_scaling = layers.create_global_var( - name=unique_name.generate("loss_scaling"), - shape=[1], - value=init_loss_scaling, - dtype='float32', - persistable=True) + self._loss_scaling = None + self._init_loss_scaling = init_loss_scaling self._use_dynamic_loss_scaling = use_dynamic_loss_scaling if self._use_dynamic_loss_scaling: self._incr_every_n_steps = incr_every_n_steps self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf self._incr_ratio = incr_ratio self._decr_ratio = decr_ratio + self._num_good_steps = None + self._num_bad_steps = None + + def get_loss_scaling(self): + """Return the real-time loss scaling factor. + """ + return self._loss_scaling + + def get_scaled_loss(self): + """Return the scaled loss. + It's useful when you feed customed loss into executor. 
+ """ + return self._scaled_loss + + def _init_amp_var(self): + self._loss_scaling = layers.create_global_var( + name=unique_name.generate("loss_scaling"), + shape=[1], + value=self._init_loss_scaling, + dtype='float32', + persistable=True) + + if self._use_dynamic_loss_scaling: self._num_good_steps = layers.create_global_var( name=unique_name.generate("num_good_steps"), shape=[1], @@ -86,28 +106,16 @@ def __init__(self, optimizer, amp_lists, init_loss_scaling, dtype='int32', persistable=True) - # Ensure the data type of learning rate vars is float32 (same as the + # Ensure the data type of learning rate vars is float32 (same as the # master parameter dtype) - if isinstance(optimizer._learning_rate, float): - optimizer._learning_rate_map[default_main_program()] = \ - layers.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(optimizer._learning_rate), - dtype='float32', - persistable=True) - - def get_loss_scaling(self): - """Return the real-time loss scaling factor. - """ - return self._loss_scaling - - def get_scaled_loss(self): - """Return the scaled loss. - It's useful when you feed customed loss into executor. - """ - - return self._scaled_loss + if isinstance(self._optimizer._learning_rate, float): + self._optimizer._learning_rate_map[default_main_program()] = \ + layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._optimizer._learning_rate), + dtype='float32', + persistable=True) def backward(self, loss, @@ -131,16 +139,21 @@ def backward(self, A list of (param, grad), which is a tuple of a parameter and its gradient respectively, and the scaled loss. 
""" - rewrite_program(self._train_program, self._amp_lists) - self._scaled_loss = loss * self._loss_scaling - self._params_grads = self._optimizer.backward( - self._scaled_loss, startup_program, parameter_list, no_grad_set, - callbacks) - # Change the op_role_var attr for some ops, so that gradients - # transferred across GPUs can be FP16. - update_role_var_grad(self._train_program, self._params_grads) - - return self._params_grads + train_program = loss.block.program + self._train_program = train_program + + with program_guard(train_program, startup_program): + self._init_amp_var() + + rewrite_program(train_program, self._amp_lists) + self._scaled_loss = loss * self._loss_scaling + params_grads = self._optimizer.backward( + self._scaled_loss, startup_program, parameter_list, no_grad_set, + callbacks) + # Change the op_role_var attr for some ops, so that gradients + # transferred across GPUs can be FP16. + update_role_var_grad(train_program, params_grads) + return params_grads def apply_gradients(self, params_grads): """ @@ -182,6 +195,12 @@ def apply_gradients(self, params_grads): return optimize_ops + def apply_optimize(self, loss, startup_program, params_grads): + program = loss.block.program + with program_guard(program, startup_program): + optimize_ops = self.apply_gradients(params_grads) + return optimize_ops + def minimize(self, loss, startup_program=None, @@ -207,7 +226,8 @@ def minimize(self, parameter_list=parameter_list, no_grad_set=no_grad_set) - optimize_ops = self.apply_gradients(scaled_params_grads) + optimize_ops = self.apply_optimize(loss, startup_program, + scaled_params_grads) return optimize_ops, scaled_params_grads diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index dadc756c43ecc..45df381b63183 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ 
b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -66,6 +66,7 @@ def __init__(self, self._fc_ops = ['fc'] self._relu_ops = ['relu', 'relu6'] self._matmul_ops = ['matmul'] + self._gru_ops = ['fusion_gru'] self._weight_scales = {} # Collect the Input and Output sclaes from Fake quant models self._var_quant_scales = {} @@ -449,8 +450,43 @@ def _compute_var_scales(ops, w_name, axis): self._var_quant_scales[weight_var_name] = (use_unsigned_int, lod_tensor) + def _compute_gru_weight_scales(wx_name, wh_name): + for op in graph.all_op_nodes(): + if op.op().type() in self._gru_ops: + wx_var_name = op.input(wx_name)[0] + wh_var_name = op.input(wh_name)[0] + wx = np.array(self._load_param(self._scope, wx_var_name)) + wh = np.array(self._load_param(self._scope, wh_var_name)) + OC = wh.shape[0] + scale_ur = 1.0 / np.max(np.abs( + np.concatenate( + [ + wx[:, :2 * OC], wh.flatten()[:2 * OC * OC] + .reshape(OC, 2 * OC) + ], + axis=0)), + axis=0) + scale_o = 1.0 / np.max(np.abs( + np.concatenate( + [ + wx[:, 2 * OC:], wh.flatten()[2 * OC * OC:] + .reshape(OC, OC) + ], + axis=0)), + axis=0) + + gru_weights_scale = np.concatenate( + [scale_ur, scale_o]).astype('float') + + lod_tensor = self._convert_scale2tensor(gru_weights_scale) + use_unsigned_int = False + self._var_quant_scales[wx_var_name] = (use_unsigned_int, + lod_tensor) + _compute_var_scales(self._conv_ops, "Filter", axis=1) _compute_var_scales(self._fc_ops, "W", axis=0) + _compute_var_scales(self._gru_ops, "WeightH", axis=0) + _compute_gru_weight_scales("WeightX", "WeightH") return graph def _find_avg_pooling_ids(self, graph): diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index b5a8d90194331..eba881a2637ae 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -758,6 +758,7 @@ def 
_insert_channel_quant_op(self, graph, var_node, name, quant_bits, attrs={ 'bit_length': quant_bits, 'quant_axis': quant_axis, + 'is_test': self._is_test, 'op_role': core.op_proto_and_checker_maker.OpRole.Forward }, inputs={'X': var_node}, @@ -1125,7 +1126,7 @@ def apply(self, graph): self._restore_var(input_arg_name, quantized_param_v) self._remove_fake_quant_and_dequant_op(graph, op_node) -# Remove all fake dequant op + # Remove all fake dequant op ops = graph.all_op_nodes() for op_node in ops: op_name = op_node.name() @@ -1331,16 +1332,25 @@ def _is_float(self, v): def _quant(self, x, scale, num_bits, quant_axis): assert quant_axis in [0, 1], 'quant_axis should be 0 or 1 for now.' + bnt = (1 << (num_bits - 1)) - 1 + + def _clip(x, scale): + x[x > scale] = scale + x[x < -scale] = -scale + return x + if isinstance(scale, list): for i, s in enumerate(scale): if quant_axis == 0: - x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1)) + x[i] = _clip(x[i], s) + x[i] = np.round(x[i] / s * bnt) else: - x[:, i] = np.round(x[:, i] / s * ( - (1 << (num_bits - 1)) - 1)) - return x + x[:, i] = _clip(x[:, i], s) + x[:, i] = np.round(x[:, i] / s * bnt) else: - return np.round(x / scale * ((1 << (num_bits - 1)) - 1)) + x = _clip(x, scale) + x = np.round(x / scale * bnt) + return x class ConvertToInt8Pass(object): diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index dd4bea06572fb..6c02076eae0de 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -98,18 +98,16 @@ function(download_quant_model install_dir data_file) endif() endfunction() -function(save_quant_ic_model_test target quant_model_dir fp32_model_save_path int8_model_save_path) +function(save_quant_ic_model_test target quant_model_dir int8_model_save_path) py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py ARGS --quant_model_path ${quant_model_dir} - 
--fp32_model_save_path ${fp32_model_save_path} --int8_model_save_path ${int8_model_save_path} --debug) endfunction() -function(save_quant_nlp_model_test target quant_model_dir fp32_model_save_path int8_model_save_path ops_to_quantize) +function(save_quant_nlp_model_test target quant_model_dir int8_model_save_path ops_to_quantize) py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py ARGS --quant_model_path ${quant_model_dir} - --fp32_model_save_path ${fp32_model_save_path} --int8_model_save_path ${int8_model_save_path} --ops_to_quantize ${ops_to_quantize}) endfunction() @@ -227,8 +225,6 @@ if(LINUX AND WITH_MKLDNN) set(NLP_LABLES_PATH "${NLP_DATA_DIR}/Ernie_dataset/label.xnli.dev") download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE}) - set(QUANT2_NLP_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add") - # Quant2 Ernie set(QUANT2_ERNIE_MODEL_ARCHIVE "ernie_qat.tar.gz") set(QUANT2_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_quant2") @@ -236,17 +232,25 @@ if(LINUX AND WITH_MKLDNN) set(FP32_ERNIE_MODEL_ARCHIVE "ernie_fp32_model.tar.gz") set(FP32_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_float") download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE}) - inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_NLP_OPS_TO_QUANTIZE}) + set(QUANT2_ERNIE_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add") + inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE}) + + # Quant2 GRU + set(QUANT2_GRU_MODEL_ARCHIVE "GRU_quant_acc.tar.gz") + set(QUANT2_GRU_MODEL_DIR "${QUANT_INSTALL_DIR}/GRU_quant2") + download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE}) + set(QUANT2_GRU_OPS_TO_QUANTIZE "fusion_gru") ### 
Save FP32 model or INT8 model from Quant model set(QUANT2_INT8_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_int8") - set(QUANT2_FP32_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_fp32") - save_quant_ic_model_test(save_quant2_model_resnet50 ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QUANT2_FP32_RESNET50_SAVE_PATH} ${QUANT2_INT8_RESNET50_SAVE_PATH}) + save_quant_ic_model_test(save_quant2_model_resnet50 ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QUANT2_INT8_RESNET50_SAVE_PATH}) set(QUANT2_INT8_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8") - set(QUANT2_FP32_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_fp32") - save_quant_nlp_model_test(save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_FP32_ERNIE_SAVE_PATH} ${QUANT2_INT8_ERNIE_SAVE_PATH} ${QUANT2_NLP_OPS_TO_QUANTIZE}) + save_quant_nlp_model_test(save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_INT8_ERNIE_SAVE_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE}) + + set(QUANT2_INT8_GRU_SAVE_PATH "${QUANT_INSTALL_DIR}/GRU_quant2_int8") + save_quant_nlp_model_test(save_quant2_model_gru ${QUANT2_GRU_MODEL_DIR}/GRU_quant_acc ${QUANT2_INT8_GRU_SAVE_PATH} ${QUANT2_GRU_OPS_TO_QUANTIZE}) # Convert Quant2 model to dot and pdf files set(QUANT2_INT8_ERNIE_DOT_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8_dot_file") diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index df505cf2435e7..eb924e13a7e4f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -31,6 +31,7 @@ from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.dygraph.nn import Linear from paddle.fluid.log_helper import get_logger +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX paddle.enable_static() @@ -231,10 +232,11 @@ def test_qat_save(self): 
before_save = lenet(test_img) # save inference quantized model - path = "./mnist_infer_model" + path = "./qat_infer_model/lenet" + save_dir = "./qat_infer_model" paddle.jit.save( layer=lenet, - model_path=path, + path=path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') @@ -245,12 +247,12 @@ def test_qat_save(self): else: place = core.CPUPlace() exe = fluid.Executor(place) - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=path, - executor=exe, - model_filename="__model__", - params_filename="__variables__")) + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + dirname=save_dir, + executor=exe, + model_filename="lenet" + INFER_MODEL_SUFFIX, + params_filename="lenet" + INFER_PARAMS_SUFFIX) after_save, = exe.run(inference_program, feed={feed_target_names[0]: test_data}, fetch_list=fetch_targets) @@ -339,7 +341,7 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000): paddle.jit.save( layer=lenet, - model_path="./dynamic_mnist", + path="./dynamic_mnist/model", input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py index 80d388ac0da62..ddf37a0ebf8c2 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py @@ -31,6 +31,7 @@ from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.dygraph.nn import Linear from paddle.fluid.log_helper import get_logger +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX paddle.enable_static() @@ -231,10 +232,11 @@ def test_qat_save(self): before_save = lenet(test_img) # save inference quantized model - path = "./mnist_infer_model" + path = "./qat_infer_model/mnist" + save_dir = 
"./qat_infer_model" paddle.jit.save( layer=lenet, - model_path=path, + path=path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') @@ -245,12 +247,12 @@ def test_qat_save(self): else: place = core.CPUPlace() exe = fluid.Executor(place) - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=path, - executor=exe, - model_filename="__model__", - params_filename="__variables__")) + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + dirname=save_dir, + executor=exe, + model_filename="mnist" + INFER_MODEL_SUFFIX, + params_filename="mnist" + INFER_PARAMS_SUFFIX) after_save, = exe.run(inference_program, feed={feed_target_names[0]: test_data}, fetch_list=fetch_targets) @@ -339,7 +341,7 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000): paddle.jit.save( layer=lenet, - model_path="./dynamic_mnist", + path="./dynamic_mnist/model", input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 9a14c4cdf14a4..ad116c2597064 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -205,8 +205,15 @@ def pre_load(dso_name): load_dso(dso_path) -def get_glibc_ver(): - return run_shell_command("ldd --version | awk '/ldd/{print $NF}'") +def get_libc_ver(): + ldd_glibc = run_shell_command("ldd --version | awk '/ldd/{print $NF}'") + if ldd_glibc is not None: + return ("glibc", ldd_glibc) + + ldd_musl = run_shell_command("ldd 2>&1 | awk '/Version/{print $NF}'") + if ldd_musl is not None: + return ("musl", ldd_musl) + return (None, None) def less_than_ver(a, b): @@ -231,13 +238,14 @@ def to_list(s): # For paddle, the problem is that 'libgomp' is a DSO with static TLS, and it is loaded after 14 DSOs. # So, here is a tricky way to solve the problem by pre load 'libgomp' before 'core_avx.so'. 
# The final solution is to upgrade glibc to > 2.22 on the target system. -if platform.system().lower() == 'linux' and less_than_ver(get_glibc_ver(), - '2.23'): - try: - pre_load('libgomp') - except Exception as e: - # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so - sys.stderr.write('Error: Can not preload libgomp.so') +if platform.system().lower() == 'linux': + libc_type, libc_ver = get_libc_ver() + if libc_type == 'glibc' and less_than_ver(libc_ver, '2.23'): + try: + pre_load('libgomp') + except Exception as e: + # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so + sys.stderr.write('Error: Can not preload libgomp.so') load_noavx = False diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index f4ea4d670e600..fb87ea4455d34 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -24,8 +24,8 @@ import warnings from .. import core from .base import guard -from paddle.fluid.dygraph.jit import SaveLoadConfig, deprecate_save_load_configs -from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.jit import _SaveLoadConfig +from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers __all__ = [ 'save_dygraph', @@ -33,35 +33,23 @@ ] -# NOTE(chenweihang): deprecate load_dygraph's argument keep_name_table, -# ensure compatibility when user still use keep_name_table argument -def deprecate_keep_name_table(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - def __warn_and_build_configs__(keep_name_table): - warnings.warn( - "The argument `keep_name_table` has deprecated, please use `SaveLoadConfig.keep_name_table`.", - DeprecationWarning) - config = SaveLoadConfig() - config.keep_name_table = keep_name_table - return config - - # deal with arg `keep_name_table` - if len(args) > 
1 and isinstance(args[1], bool): - args = list(args) - args[1] = __warn_and_build_configs__(args[1]) - # deal with kwargs - elif 'keep_name_table' in kwargs: - kwargs['config'] = __warn_and_build_configs__(kwargs[ - 'keep_name_table']) - kwargs.pop('keep_name_table') - else: - # do nothing - pass +def _parse_load_config(configs): + supported_configs = ['model_filename', 'params_filename', 'keep_name_table'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.fluid.load_dygraph` is not supported." + % (key)) - return func(*args, **kwargs) + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.model_filename = configs.get('model_filename', None) + inner_config.params_filename = configs.get('params_filename', None) + inner_config.keep_name_table = configs.get('keep_name_table', None) - return wrapper + return inner_config @dygraph_only @@ -132,12 +120,12 @@ def save_dygraph(state_dict, model_path): pickle.dump(model_dict, f, protocol=2) +# NOTE(chenweihang): load_dygraph will deprecated in future, we don't +# support new loading features for it # TODO(qingqing01): remove dygraph_only to support loading static model. # maybe need to unify the loading interface after 2.0 API is ready. # @dygraph_only -@deprecate_save_load_configs -@deprecate_keep_name_table -def load_dygraph(model_path, config=None): +def load_dygraph(model_path, **configs): ''' :api_attr: imperative @@ -152,10 +140,13 @@ def load_dygraph(model_path, config=None): Args: model_path(str) : The file prefix store the state_dict. (The path should Not contain suffix '.pdparams') - config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` - object that specifies additional configuration options, these options - are for compatibility with ``jit.save/io.save_inference_model`` formats. - Default None. + **configs (dict, optional): other save configuration options for compatibility. 
We do not + recommend using these configurations, if not necessary, DO NOT use them. Default None. + The following options are currently supported: + (1) model_filename (string): The inference model file name of the paddle 1.x ``save_inference_model`` + save format. Default file name is :code:`__model__` . + (2) params_filename (string): The persistable variables file name of the paddle 1.x ``save_inference_model`` + save format. No default file name, save variables separately by default. Returns: state_dict(dict) : the dict store the state_dict @@ -196,8 +187,7 @@ def load_dygraph(model_path, config=None): opti_file_path = model_prefix + ".pdopt" # deal with argument `config` - if config is None: - config = SaveLoadConfig() + config = _parse_load_config(configs) if os.path.exists(params_file_path) or os.path.exists(opti_file_path): # Load state dict by `save_dygraph` save format @@ -246,7 +236,6 @@ def load_dygraph(model_path, config=None): persistable_var_dict = _construct_params_and_buffers( model_prefix, programs, - config.separate_params, config.params_filename, append_suffix=False) @@ -255,9 +244,9 @@ def load_dygraph(model_path, config=None): for var_name in persistable_var_dict: para_dict[var_name] = persistable_var_dict[var_name].numpy() - # if __variables.info__ exists, we can recover structured_name - var_info_path = os.path.join(model_prefix, - EXTRA_VAR_INFO_FILENAME) + # if *.info exists, we can recover structured_name + var_info_filename = str(config.params_filename) + ".info" + var_info_path = os.path.join(model_prefix, var_info_filename) if os.path.exists(var_info_path): with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py index 8a8787da3a543..bfcb43f5f677c 100644 --- a/python/paddle/fluid/dygraph/container.py +++ b/python/paddle/fluid/dygraph/container.py @@ -34,27 +34,26 @@ class Sequential(Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np data = np.random.uniform(-1, 1, [30, 10]).astype('float32') - with fluid.dygraph.guard(): - data = fluid.dygraph.to_variable(data) - # create Sequential with iterable Layers - model1 = fluid.dygraph.Sequential( - fluid.Linear(10, 1), fluid.Linear(1, 2) - ) - model1[0] # access the first layer - res1 = model1(data) # sequential execution - - # create Sequential with name Layer pairs - model2 = fluid.dygraph.Sequential( - ('l1', fluid.Linear(10, 2)), - ('l2', fluid.Linear(2, 3)) - ) - model2['l1'] # access l1 layer - model2.add_sublayer('l3', fluid.Linear(3, 3)) # add sublayer - res2 = model2(data) # sequential execution + data = paddle.to_tensor(data) + # create Sequential with iterable Layers + model1 = paddle.nn.Sequential( + paddle.nn.Linear(10, 1), paddle.nn.Linear(1, 2) + ) + model1[0] # access the first layer + res1 = model1(data) # sequential execution + + # create Sequential with name Layer pairs + model2 = paddle.nn.Sequential( + ('l1', paddle.nn.Linear(10, 2)), + ('l2', paddle.nn.Linear(2, 3)) + ) + model2['l1'] # access l1 layer + model2.add_sublayer('l3', paddle.nn.Linear(3, 3)) # add sublayer + res2 = model2(data) # sequential execution """ @@ -99,15 +98,15 @@ class ParameterList(Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np - class MyLayer(fluid.Layer): + class MyLayer(paddle.nn.Layer): def __init__(self, num_stacked_param): super(MyLayer, self).__init__() # create ParameterList with iterable Parameters - self.params = fluid.dygraph.ParameterList( - [fluid.layers.create_parameter( + self.params = paddle.nn.ParameterList( + [paddle.create_parameter( shape=[2, 2], dtype='float32')] * num_stacked_param) def forward(self, x): @@ -119,27 +118,26 @@ def forward(self, x): "Y": p}, outputs={"Out": tmp}, attrs={"x_num_col_dims": 1, - "y_num_col_dims": 1}) + "y_num_col_dims": 1}) x = tmp return x data_np = np.random.uniform(-1, 1, [5, 2]).astype('float32') - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(data_np) - num_stacked_param = 4 - model = MyLayer(num_stacked_param) - print(len(model.params)) # 4 - res = model(x) - print(res.shape) # [5, 2] - - replaced_param = fluid.layers.create_parameter(shape=[2, 3], dtype='float32') - model.params[num_stacked_param - 1] = replaced_param # replace last param - res = model(x) - print(res.shape) # [5, 3] - model.params.append(fluid.layers.create_parameter(shape=[3, 4], dtype='float32')) # append param - print(len(model.params)) # 5 - res = model(x) - print(res.shape) # [5, 4] + x = paddle.to_tensor(data_np) + num_stacked_param = 4 + model = MyLayer(num_stacked_param) + print(len(model.params)) # 4 + res = model(x) + print(res.shape) # [5, 2] + + replaced_param = paddle.create_parameter(shape=[2, 3], dtype='float32') + model.params[num_stacked_param - 1] = replaced_param # replace last param + res = model(x) + print(res.shape) # [5, 3] + model.params.append(paddle.create_parameter(shape=[3, 4], dtype='float32')) # append param + print(len(model.params)) # 5 + res = model(x) + print(res.shape) # [5, 4] """ def __init__(self, parameters=None): @@ -183,14 +181,15 @@ class LayerList(Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid + + import paddle import numpy as np - class MyLayer(fluid.Layer): + class MyLayer(paddle.nn.Layer): def __init__(self): super(MyLayer, self).__init__() - self.linears = fluid.dygraph.LayerList( - [fluid.dygraph.Linear(10, 10) for i in range(10)]) + self.linears = paddle.nn.LayerList( + [paddle.nn.Linear(10, 10) for i in range(10)]) def forward(self, x): # LayerList can act as an iterable, or be indexed using ints @@ -239,13 +238,13 @@ def append(self, sublayer): Examples: .. code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - linears = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(10)]) - another = fluid.dygraph.Linear(10, 10) - linears.append(another) - print(len(linears)) # 11 + import paddle + + linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) + another = paddle.nn.Linear(10, 10) + linears.append(another) + print(len(linears)) # 11 """ self.add_sublayer(str(len(self)), sublayer) return self @@ -260,13 +259,13 @@ def insert(self, index, sublayer): Examples: .. code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - linears = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(10)]) - another = fluid.dygraph.Linear(10, 10) - linears.insert(3, another) - print(linears[3] is another) # True + import paddle + + linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) + another = paddle.nn.Linear(10, 10) + linears.insert(3, another) + print(linears[3] is another) # True """ assert isinstance(index, int) and \ 0 <= index < len(self._sub_layers), \ @@ -284,14 +283,14 @@ def extend(self, sublayers): Examples: .. 
code-block:: python - import paddle.fluid as fluid - - with fluid.dygraph.guard(): - linears = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(10)]) - another_list = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(5)]) - linears.extend(another_list) - print(len(linears)) # 15 - print(another_list[0] is linears[10]) # True + + import paddle + + linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) + another_list = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(5)]) + linears.extend(another_list) + print(len(linears)) # 15 + print(another_list[0] is linears[10]) # True """ offset = len(self) for i, sublayer in enumerate(sublayers): diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 4a3dacbd1acae..a10adeb14aa7d 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -31,8 +31,10 @@ __all__ = ['TranslatedLayer'] -VARIABLE_FILENAME = "__variables__" -EXTRA_VAR_INFO_FILENAME = "__variables.info__" +INFER_MODEL_SUFFIX = ".pdmodel" +INFER_PARAMS_SUFFIX = ".pdiparams" +INFER_PARAMS_INFO_SUFFIX = ".pdiparams.info" + LOADED_VAR_SUFFIX = "load" PARAMETER_NAME_PREFIX = "param" BUFFER_NAME_PREFIX = "buffer" @@ -424,11 +426,8 @@ def _load_persistable_vars_by_program(model_path, return load_var_dict -def _load_persistable_vars(model_path, - var_info_path, - program_holder, - separate_params=False, - params_filename=None): +def _load_persistable_vars(model_path, var_info_path, program_holder, + params_filename): # 1. 
load extra var info with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) @@ -464,33 +463,22 @@ def _load_persistable_vars(model_path, new_var = framework._varbase_creator( name=new_name, persistable=True) - # load separate vars - if separate_params is True: - framework._dygraph_tracer().trace_op( - type='load', - inputs={}, - outputs={'Out': new_var}, - attrs={'file_path': os.path.join(model_path, name)}) - new_var.stop_gradient = extra_var_info[name]['stop_gradient'] load_var_dict[new_name] = new_var load_var_list.append(new_var) # 3. load all vars - if separate_params is False: - if params_filename is not None: - var_file_path = os.path.join(model_path, params_filename) - else: - var_file_path = os.path.join(model_path, VARIABLE_FILENAME) - if not os.path.exists(var_file_path): - if len(extra_var_info) != 0: - raise ValueError("The model to be loaded is incomplete.") - else: - framework._dygraph_tracer().trace_op( - type='load_combine', - inputs={}, - outputs={'Out': load_var_list}, - attrs={'file_path': var_file_path}) + assert params_filename is not None, "params_filename should not be None." 
+ var_file_path = os.path.join(model_path, params_filename) + if not os.path.exists(var_file_path): + if len(extra_var_info) != 0: + raise ValueError("The model to be loaded is incomplete.") + else: + framework._dygraph_tracer().trace_op( + type='load_combine', + inputs={}, + outputs={'Out': load_var_list}, + attrs={'file_path': var_file_path}) return load_var_dict @@ -532,14 +520,13 @@ def _construct_program_holders(model_path, model_filename=None): def _construct_params_and_buffers(model_path, programs, - separate_params=False, params_filename=None, append_suffix=True): - var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) + var_info_filename = str(params_filename) + ".info" + var_info_path = os.path.join(model_path, var_info_filename) if os.path.exists(var_info_path): var_dict = _load_persistable_vars(model_path, var_info_path, - programs['forward'], separate_params, - params_filename) + programs['forward'], params_filename) else: var_dict = _load_persistable_vars_by_program( model_path, programs['forward'], params_filename) @@ -700,18 +687,16 @@ def _construct(model_path, configs=None): raise ValueError("There is no directory named '%s'" % model_path) model_filename = None params_filename = None - separate_params = False if configs is not None: model_filename = configs.model_filename params_filename = configs.params_filename - separate_params = configs.separate_params # 1. load program desc & construct _ProgramHolder programs = _construct_program_holders(model_path, model_filename) # 2. load layer parameters & buffers - persistable_vars = _construct_params_and_buffers( - model_path, programs, separate_params, params_filename) + persistable_vars = _construct_params_and_buffers(model_path, programs, + params_filename) # 3. 
construct TranslatedLayer object translated_layer = TranslatedLayer(programs, persistable_vars) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 194ebafb08eef..6cdd13fba82ac 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -29,7 +29,7 @@ from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticFunction, unwrap_decorators -from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer +from paddle.fluid.dygraph.io import TranslatedLayer, INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX from paddle.fluid.dygraph.layers import Layer from paddle.fluid.executor import Executor, scope_guard from paddle.fluid.framework import Block, ParamBase, Program, Variable @@ -39,7 +39,7 @@ __all__ = [ 'TracedLayer', 'declarative', 'dygraph_to_static_func', 'set_code_level', - 'set_verbosity', 'save', 'load', 'SaveLoadConfig' + 'set_verbosity', 'save', 'load' ] @@ -228,73 +228,7 @@ def decorated(python_func): return decorated -class SaveLoadConfig(object): - """ - The additional configuration options may be used in function - ``paddle.jit.save/load`` and ``paddle.load`` . - - Examples: - 1. Using ``SaveLoadConfig`` when saving model - - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - # use SaveLoadconfig when saving model - model_path = "simplenet.example.model" - config = paddle.SaveLoadConfig() - config.model_filename = "__simplenet__" - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - 2. Using ``SaveLoadConfig`` when loading model - - .. code-block:: python - - import paddle - - # enable dygraph mode - paddle.disable_static() - - # use SaveLoadconfig when loading model - model_path = "simplenet.example.model" - config = paddle.SaveLoadConfig() - config.model_filename = "__simplenet__" - infer_net = paddle.jit.load(model_path, config=config) - # inference - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ - +class _SaveLoadConfig(object): def __init__(self): self._output_spec = None self._model_filename = None @@ -316,335 +250,105 @@ def __init__(self): @property def output_spec(self): - """ - Selects the output targets of the saved model ( ``paddle.jit.TranslatedLayer`` ). - By default, all return variables of original Layer's forward function - are kept as the output of the saved TranslatedLayer. - - The ``output_spec`` type should be list[Variable]. If the provided ``output_spec`` - list is not all output variables, the saved model will be pruned according to the - given ``output_spec`` list. - - .. 
note:: - The ``output_spec`` is only used when saving model. - - Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - loss = paddle.tensor.mean(z) - return z, loss - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out, loss = net(x) - loss.backward() - adam.step() - adam.clear_grad() - - # use SaveLoadconfig.output_spec - model_path = "simplenet.example.model.output_spec" - config = paddle.SaveLoadConfig() - config.output_spec = [out] - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - infer_net = paddle.jit.load(model_path) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ return self._output_spec @output_spec.setter def output_spec(self, spec): + if spec is None: + return if not isinstance(spec, list): raise TypeError( - "The SaveLoadConfig.output_spec should be 'list', but received input type is %s." + "The config `output_spec` should be 'list', but received input type is %s." % type(input)) for var in spec: if not isinstance(var, core.VarBase): raise TypeError( - "The element in SaveLoadConfig.output_spec list should be 'Variable', but received element's type is %s." + "The element in config `output_spec` list should be 'Variable', but received element's type is %s." % type(var)) self._output_spec = spec @property def model_filename(self): - """ - The name of file to save the translated program of target Layer. - Default filename is :code:`__model__` . - - Examples: - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - # saving with configs.model_filename - model_path = "simplenet.example.model.model_filename" - config = paddle.SaveLoadConfig() - config.model_filename = "__simplenet__" - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - # loading with configs.model_filename - infer_net = paddle.jit.load(model_path, config=config) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ return self._model_filename @model_filename.setter def model_filename(self, filename): + if filename is None: + return if not isinstance(filename, six.string_types): raise TypeError( - "The SaveLoadConfig.model_filename should be str, but received input's type is %s." + "The config `model_filename` should be str, but received input's type is %s." % type(filename)) if len(filename) == 0: - raise ValueError( - "The SaveLoadConfig.model_filename is empty string.") + raise ValueError("The config `model_filename` is empty string.") self._model_filename = filename @property def params_filename(self): - """ - The name of file to save all persistable variables in target Layer. - Default file name is :code:`__variables__` . - - Examples: - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - model_path = "simplenet.example.model.params_filename" - config = paddle.SaveLoadConfig() - config.params_filename = "__params__" - - # saving with configs.params_filename - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - # loading with configs.params_filename - infer_net = paddle.jit.load(model_path, config=config) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ return self._params_filename @params_filename.setter def params_filename(self, filename): + if filename is None: + return if not isinstance(filename, six.string_types): raise TypeError( - "The SaveLoadConfig.params_filename should be str, but received input's type is %s." + "The config `params_filename` should be str, but received input's type is %s." % type(filename)) if len(filename) == 0: - raise ValueError( - "The SaveLoadConfig.params_filename is empty string.") + raise ValueError("The config `params_filename` is empty string.") self._params_filename = filename - # NOTE: [why not use params_filename=None control params saved separately] - # The new save interface does not recommend parameters to be saved separately. - # Here, the concept should be separated as clearly as possible. - # Setting params_filename=None only means that the saved file name is set - # and without any other meaning. 
New separate_params control for file saved - # separately can makes the concept clearer. - @property - def separate_params(self): - """ - Configure whether to save the Layer parameters as separete files. - (In order to be compatible with the behavior of ``paddle.static.save_inference_model`` ) - - If True, each parameter will be saved to a file separately, the file name is the parameter name, - and the SaveLoadConfig.params_filename configuration will not take effect. Default False. - - .. note:: - Only used for ``paddle.jit.save`` . - - Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - model_path = "simplenet.example.model.separate_params" - config = paddle.SaveLoadConfig() - config.separate_params = True - - # saving with configs.separate_params - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - # [result] the saved model directory contains: - # linear_0.b_0 linear_0.w_0 __model__ __variables.info__ - - # loading with configs.params_filename - infer_net = paddle.jit.load(model_path, config=config) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ - return self._separate_params - - @separate_params.setter - def separate_params(self, value): - if not isinstance(value, bool): - raise TypeError( - "The SaveLoadConfig.separate_params should be bool value, but received input's type is %s." 
- % type(value)) - self._separate_params = value - @property def keep_name_table(self): - """ - Configures whether keep ``structured_name -> parameter_name`` dict in loaded state dict. - This dict is the debugging information saved when call ``paddle.save`` . - It is generally only used for debugging and does not affect the actual training or inference. - By default, it will not be retained in ``paddle.load`` result. Default: False. - - .. note:: - Only used for ``paddle.load`` . - - Examples: - .. code-block:: python - - import paddle - - paddle.disable_static() - - linear = paddle.nn.Linear(5, 1) - - state_dict = linear.state_dict() - paddle.save(state_dict, "paddle_dy.pdparams") - - config = paddle.SaveLoadConfig() - config.keep_name_table = True - para_state_dict = paddle.load("paddle_dy.pdparams", config) - - print(para_state_dict) - # the name_table is 'StructuredToParameterName@@' - # {'bias': array([0.], dtype=float32), - # 'StructuredToParameterName@@': - # {'bias': u'linear_0.b_0', 'weight': u'linear_0.w_0'}, - # 'weight': array([[ 0.04230034], - # [-0.1222527 ], - # [ 0.7392676 ], - # [-0.8136974 ], - # [ 0.01211023]], dtype=float32)} - """ return self._keep_name_table @keep_name_table.setter def keep_name_table(self, value): + if value is None: + return if not isinstance(value, bool): raise TypeError( - "The SaveLoadConfig.keep_name_table should be bool value, but received input's type is %s." + "The config `keep_name_table` should be bool value, but received input's type is %s." % type(value)) self._keep_name_table = value +def _parse_save_configs(configs): + supported_configs = ['output_spec'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.jit.save` is not supported." 
+ % (key)) + + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.output_spec = configs.get('output_spec', None) + + return inner_config + + +def _parse_load_config(configs): + supported_configs = ['model_filename', 'params_filename'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.jit.load` is not supported." + % (key)) + + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.model_filename = configs.get('model_filename', None) + inner_config.params_filename = configs.get('params_filename', None) + + return inner_config + + def _get_input_var_names(inputs, input_spec): name_none_error = "The %s's name is None. " \ "When using jit.save, please set InputSepc's name in " \ @@ -712,47 +416,88 @@ def _get_output_vars(outputs, output_spec): return result_list -# NOTE(chenweihang): change jit.save/load argument `configs` to `config` -def deprecate_save_load_configs(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - if 'configs' in kwargs: - kwargs['config'] = kwargs['configs'] - kwargs.pop('configs') - return func(*args, **kwargs) +# NOTE(chenweihang): [ Handling of use cases of API paddle.jit.load ] +# `paddle.jit.load` may be used to load saved results of: +# 1. Expected cases: +# - paddle.jit.save +# - paddle.static.save_inference_model +# - paddle.fluid.io.save_inference_model +# 2. 
Error cases: +# - paddle.save: no .pdmodel for prefix +# - paddle.static.save: no .pdiparams but .pdparams exists +# - paddle.fluid.io.save_params/save_persistables: no __model__ +# TODO(chenweihang): polish error message in above error cases +def _build_load_path_and_config(path, config): + # NOTE(chenweihang): If both [prefix save format] and [directory save format] exist, + # raise error, avoid confusing behavior + prefix_format_path = path + INFER_MODEL_SUFFIX + prefix_format_exist = os.path.exists(prefix_format_path) + directory_format_exist = os.path.isdir(path) + if prefix_format_exist and directory_format_exist: + raise ValueError( + "The %s.pdmodel and %s directory exist at the same time, " + "don't know which one to load, please make sure that the specified target " + "of ``path`` is unique." % (path, path)) + elif not prefix_format_exist and not directory_format_exist: + raise ValueError("The ``path`` (%s) to load model not exists." % path) + else: + if prefix_format_exist: + file_prefix = os.path.basename(path) + model_path = os.path.dirname(path) + if config.model_filename is not None: + warnings.warn( + "When loading the result saved with the " + "specified file prefix, the ``model_filename`` config does " + "not take effect.") + config.model_filename = file_prefix + INFER_MODEL_SUFFIX + if config.params_filename is not None: + warnings.warn( + "When loading the result saved with the " + "specified file prefix, the ``params_filename`` config does " + "not take effect.") + config.params_filename = file_prefix + INFER_PARAMS_SUFFIX + else: + # Compatible with the old save_inference_model format + model_path = path - return wrapper + return model_path, config -@deprecate_save_load_configs @switch_to_static_graph -def save(layer, model_path, input_spec=None, config=None): +def save(layer, path, input_spec=None, **configs): """ - Saves input declarative Layer as :ref:`api_imperative_TranslatedLayer` + Saves input Layer as ``paddle.jit.TranslatedLayer`` 
format model, which can be used for inference or fine-tuning after loading. It will save the translated program and all related persistable - variables of input declarative Layer to given ``model_path``. + variables of input Layer to given ``path``. - The default saved translated program file name is ``__model__``, - and the default saved persistable variables file name is ``__variables__``, - and it also saved some additional variable description information to file - ``__variables.info__``, these additional information is used in fine-tuning. + ``path`` is the prefix of saved objects, and the saved translated program file + suffix is ``.pdmodel``, the saved persistable variables file suffix is ``.pdiparams``, + and here also saved some additional variable description information to a file, + its suffix is ``.pdiparams.info``, these additional information is used in fine-tuning. The saved model can be loaded by follow APIs: - - :ref:`api_imperative_jit_load` - - :ref:`api_fluid_io_load_inference_model` (need pass ``params_filename='__variables__'``) + - ``paddle.jit.load`` + - ``paddle.static.load_inference_model`` - Other C++ inference APIs Args: - layer (Layer): the Layer to be saved. The Layer should be decorated by `@declarative`. - model_path (str): the directory to save the model. - input_spec (list[Variable], optional): Describes the input of the saved model. + layer (Layer): the Layer to be saved. The Layer should be decorated by `@paddle.jit.to_static`. + path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. + input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model. It is the example inputs that will be passed to saved TranslatedLayer's forward function. If None, all input variables of the original Layer's forward function would be the inputs of the saved model. Default None. 
- config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object - that specifies additional configuration options. Default None. + **configs (dict, optional): other save configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, + DO NOT use them. Default None. + The following options are currently supported: + (1) output_spec (list[Tensor]): Selects the output targets of the saved model. + By default, all return variables of original Layer's forward function are kept as the + output of the saved model. If the provided ``output_spec`` list is not all output variables, + the saved model will be pruned according to the given ``output_spec`` list. + Returns: None @@ -804,10 +549,6 @@ def train(layer, loader, loss_fn, opt): print("Epoch {} batch {}: loss = {}".format( epoch_id, batch_id, np.mean(loss.numpy()))) - # enable dygraph mode - place = paddle.CPUPlace() - paddle.disable_static(place) - # 1. train & save model. # create network @@ -818,7 +559,6 @@ def train(layer, loader, loss_fn, opt): # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) loader = paddle.io.DataLoader(dataset, - places=place, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, @@ -828,11 +568,11 @@ def train(layer, loader, loss_fn, opt): train(layer, loader, loss_fn, adam) # save - model_path = "linear.example.model" - paddle.jit.save(layer, model_path) + path = "example_model/linear" + paddle.jit.save(layer, path) """ - # 1. input check + # 1. input build & check prog_translator = ProgramTranslator() if not prog_translator.enable_to_static: raise RuntimeError( @@ -843,9 +583,17 @@ def train(layer, loader, loss_fn, opt): "The input layer of paddle.jit.save should be 'Layer', but received layer type is %s." 
% type(layer)) - configs = config - if configs is None: - configs = SaveLoadConfig() + # path check + file_prefix = os.path.basename(path) + if file_prefix == "": + raise ValueError( + "The input path MUST be format of dirname/file_prefix " + "[dirname\\file_prefix in Windows system], but received " + "file_prefix is empty string.") + + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) # avoid change user given input_spec inner_input_spec = None @@ -866,6 +614,9 @@ def train(layer, loader, loss_fn, opt): "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s." % type(var)) + # parse configs + configs = _parse_save_configs(configs) + # 2. get program from Layer # TODO(chenweihang): add support for other method, not only forward if isinstance(layer.forward, StaticFunction): @@ -927,9 +678,12 @@ def train(layer, loader, loss_fn, opt): # 5. save inference model from paddle.fluid.io import save_inference_model - # VARIABLE_FILENAME keep nameing style consistent with '__model__' - if configs.params_filename is None: - configs.params_filename = VARIABLE_FILENAME + # construct new save_inference_model arguments + model_path = dirname + # NOTE(chenweihang): because prefix contains model and params filename, + # so we don't support set model_filename & params_filename + model_filename = file_prefix + INFER_MODEL_SUFFIX + params_filename = file_prefix + INFER_PARAMS_SUFFIX with scope_guard(scope): save_inference_model( @@ -938,9 +692,8 @@ def train(layer, loader, loss_fn, opt): target_vars=output_vars, executor=Executor(_current_expected_place()), main_program=concrete_program.main_program.clone(), - model_filename=configs.model_filename, - params_filename=None - if configs.separate_params else configs.params_filename, + model_filename=model_filename, + params_filename=params_filename, export_for_deployment=configs._export_for_deployment, 
program_only=configs._program_only) @@ -958,23 +711,23 @@ def train(layer, loader, loss_fn, opt): # Due to compatibility issues, we cannot change the original storage structure, # but we can save these information in `jit.save` without changing the original # storage to improve user experience. So we save extra information into - # file `__variables.info__` - extra_var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) + # file `***.pdiparams.info` + extra_var_info_path = path + INFER_PARAMS_INFO_SUFFIX with open(extra_var_info_path, 'wb') as f: pickle.dump(extra_var_info, f, protocol=2) -@deprecate_save_load_configs @dygraph_only -def load(model_path, config=None): +def load(path, **configs): """ :api_attr: imperative - Load model saved by :ref:`api_imperative_jit_save` or :ref:`api_fluid_io_save_inference_model` - as :ref:`api_imperative_TranslatedLayer`, then performing inference or fine-tune training. + Load model saved by ``paddle.jit.save`` or ``paddle.static.save_inference_model`` or + paddle 1.x API ``paddle.fluid.io.save_inference_model`` as ``paddle.jit.TranslatedLayer``, + then performing inference or fine-tune training. .. note:: - For some historical reasons, if you load model saved by :ref:`api_fluid_io_save_inference_model`, + If you load model saved by ``paddle.static.save_inference_model`` , there will be the following limitations when using it in fine-tuning: 1. Imperative mode do not support LoDTensor. All original model's feed targets or parametars that depend on LoD are temporarily unavailable. 2. All saved model's feed targets need to be passed into TranslatedLayer's forward function. @@ -982,15 +735,23 @@ def load(model_path, config=None): 4. The parameter's ``trainable`` information is lost and can not be recovered. Args: - model_path (str): The directory path where the model is saved. - config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies - additional configuration options. 
Default None. + path (str): The path prefix to load model. The format is ``dirname/file_prefix`` or ``file_prefix``. + **configs (dict, optional): other load configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, + DO NOT use them. Default None. + The following options are currently supported: + (1) model_filename (string): The inference model file name of the paddle 1.x + ``save_inference_model`` save format. Default file name is :code:`__model__` . + (2) params_filename (string): The persistable variables file name of the paddle 1.x + ``save_inference_model`` save format. No default file name, save variables separately + by default. + Returns: TranslatedLayer: A Layer object can run saved translated model. Examples: - 1. Load model saved by :ref:`api_imperative_jit_save` then performing inference and fine-tune training. + 1. Load model saved by ``paddle.jit.save`` then performing inference and fine-tune training. .. code-block:: python @@ -1039,10 +800,6 @@ def train(layer, loader, loss_fn, opt): print("Epoch {} batch {}: loss = {}".format( epoch_id, batch_id, np.mean(loss.numpy()))) - # enable dygraph mode - place = paddle.CPUPlace() - paddle.disable_static(place) - # 1. train & save model. # create network @@ -1053,7 +810,6 @@ def train(layer, loader, loss_fn, opt): # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) loader = paddle.io.DataLoader(dataset, - places=place, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, @@ -1063,13 +819,13 @@ def train(layer, loader, loss_fn, opt): train(layer, loader, loss_fn, adam) # save - model_path = "linear.example.model" - paddle.jit.save(layer, model_path) + path = "example_model/linear" + paddle.jit.save(layer, path) # 2. 
load model # load - loaded_layer = paddle.jit.load(model_path) + loaded_layer = paddle.jit.load(path) # inference loaded_layer.eval() @@ -1082,15 +838,17 @@ def train(layer, loader, loss_fn, opt): train(loaded_layer, loader, loss_fn, adam) - 2. Load model saved by :ref:`api_fluid_io_save_inference_model` then performing and fine-tune training. + 2. Load model saved by ``paddle.fluid.io.save_inference_model`` then performing and fine-tune training. .. code-block:: python import numpy as np import paddle import paddle.fluid as fluid + import paddle.static as static import paddle.nn as nn import paddle.optimizer as opt + import paddle.nn.functional as F BATCH_SIZE = 16 BATCH_NUM = 4 @@ -1112,18 +870,18 @@ def __getitem__(self, idx): def __len__(self): return self.num_samples - image = fluid.data(name='image', shape=[None, 784], dtype='float32') - label = fluid.data(name='label', shape=[None, 1], dtype='int64') - pred = fluid.layers.fc(input=image, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss) + image = static.data(name='image', shape=[None, 784], dtype='float32') + label = static.data(name='label', shape=[None, 1], dtype='int64') + pred = static.nn.fc(input=image, size=10, act='softmax') + loss = F.cross_entropy(input=pred, label=label) + avg_loss = paddle.mean(loss) - optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = paddle.CPUPlace() + exe = static.Executor(place) + exe.run(static.default_startup_program()) # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) @@ -1138,7 +896,7 @@ def __len__(self): # 1. 
train and save inference model for data in loader(): exe.run( - fluid.default_main_program(), + static.default_main_program(), feed=data, fetch_list=[avg_loss]) @@ -1179,6 +937,10 @@ def __len__(self): print("Epoch {} batch {}: loss = {}".format( epoch_id, batch_id, np.mean(loss.numpy()))) """ + # 1. construct correct config + config = _parse_load_config(configs) + model_path, config = _build_load_path_and_config(path, config) + return TranslatedLayer._construct(model_path, config) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 88e24e7e1ea99..3ae6d384be7e3 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -62,10 +62,6 @@ def remove(self): class Layer(core.Layer): """ - :alias_main: paddle.nn.Layer - :alias: paddle.nn.Layer - :old_api: paddle.fluid.dygraph.layers.Layer - Dynamic graph Layer based on OOD, includes the parameters of the layer, the structure of the forward graph and so on. Parameters: @@ -74,16 +70,16 @@ class Layer(core.Layer): can be "my_layer_0.w_n", where "w" is the parameter base name and "n" is an unique suffix auto-generated. If None, prefix name will be snake cased class name. Default: None. - dtype(str or core.VarDesc.VarType, optional): data type of this parameter. + dtype(str, optional): data type of this parameter. If set str, it can be "bool", "float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8" or "uint16". - Default: ``core.VarDesc.VarType.FP32`` + Default: "float32" Returns: None """ - def __init__(self, name_scope=None, dtype=core.VarDesc.VarType.FP32): + def __init__(self, name_scope=None, dtype="float32"): self.training = True if name_scope is None: name_scope = _convert_camel_to_snake(self.__class__.__name__) @@ -110,6 +106,30 @@ def train(self): Returns: None + + Example:: + .. 
code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + self._dropout = paddle.nn.Dropout(p=0.5) + + def forward(self, input): + temp = self._linear(input) + temp = self._dropout(temp) + return temp + + x = paddle.randn([10, 1], 'float32') + mylayer = MyLayer() + mylayer.eval() # set mylayer._dropout to eval mode + out = mylayer(x) + mylayer.train() # set mylayer._dropout to train mode + out = mylayer(x) + """ # global setting framework._dygraph_tracer().train_mode() @@ -125,6 +145,29 @@ def eval(self): Returns: None + + Example:: + .. code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + self._dropout = paddle.nn.Dropout(p=0.5) + + def forward(self, input): + temp = self._linear(input) + temp = self._dropout(temp) + return temp + + x = paddle.randn([10, 1], 'float32') + mylayer = MyLayer() + mylayer.eval() # set mylayer._dropout to eval mode + out = mylayer(x) + print(out) + """ # global setting framework._dygraph_tracer().eval_mode() @@ -149,15 +192,13 @@ def apply(self, fn): import paddle import paddle.nn as nn - - paddle.disable_static() - + net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) def init_weights(layer): if type(layer) == nn.Linear: print('before init weight:', layer.weight.numpy()) - new_weight = paddle.fill_constant(layer.weight.shape, layer.weight.dtype, value=0.9) + new_weight = paddle.full(shape=layer.weight.shape, dtype=layer.weight.dtype, fill_value=0.9) layer.weight.set_value(new_weight) print('after init weight:', layer.weight.numpy()) @@ -177,6 +218,23 @@ def full_name(self): Returns: str: full name of this layer. + + Example:: + .. 
code-block:: python + + import paddle + + class LinearNet(paddle.nn.Layer): + def __init__(self): + super(LinearNet, self).__init__(name_scope = "demo_linear_net") + self._linear = paddle.nn.Linear(1, 1) + + def forward(self, x): + return self._linear(x) + + linear_net = LinearNet() + print(linear_net.full_name()) # demo_linear_net_0 + """ return self._full_name @@ -197,34 +255,33 @@ def register_forward_post_hook(self, hook): Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle + import numpy as np + + # the forward_post_hook change the output of the layer: output = output * 2 + def forward_post_hook(layer, input, output): + # user can use layer, input and output for information statistis tasks - # the forward_post_hook change the output of the layer: output = output * 2 - def forward_post_hook(layer, input, output): - # user can use layer, input and output for information statistis tasks + # change the output + return output * 2 - # change the output - return output * 2 + linear = paddle.nn.Linear(13, 5) - with fluid.dygraph.guard(): - linear = fluid.Linear(13, 5, dtype="float32") + # register the hook + forward_post_hook_handle = linear.register_forward_post_hook(forward_post_hook) - # register the hook - forward_post_hook_handle = linear.register_forward_post_hook(forward_post_hook) - - value1 = np.arange(26).reshape(2, 13).astype("float32") - in1 = fluid.dygraph.to_variable(value1) - - out0 = linear(in1) - - # remove the hook - forward_post_hook_handle.remove() + value1 = np.arange(26).reshape(2, 13).astype("float32") + in1 = paddle.to_tensor(value1) - out1 = linear(in1) + out0 = linear(in1) - # hook change the linear's output to output * 2, so out0 is equal to out1 * 2. - assert (out0.numpy() == (out1.numpy()) * 2).any() + # remove the hook + forward_post_hook_handle.remove() + + out1 = linear(in1) + + # hook change the linear's output to output * 2, so out0 is equal to out1 * 2. 
+ assert (out0.numpy() == (out1.numpy()) * 2).any() """ hook_remove_helper = HookRemoveHelper(self._forward_post_hooks) self._forward_post_hooks[hook_remove_helper._hook_id] = hook @@ -249,36 +306,35 @@ def register_forward_pre_hook(self, hook): Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle + import numpy as np - # the forward_post_hook change the input of the layer: input = input * 2 - def forward_pre_hook(layer, input): - # user can use layer and input for information statistis tasks + # the forward_post_hook change the input of the layer: input = input * 2 + def forward_pre_hook(layer, input): + # user can use layer and input for information statistis tasks - # change the input - input_return = (input[0] * 2) - return input_return + # change the input + input_return = (input[0] * 2) + return input_return - with fluid.dygraph.guard(): - linear = fluid.Linear(13, 5, dtype="float32") + linear = paddle.nn.Linear(13, 5) - # register the hook - forward_pre_hook_handle = linear.register_forward_pre_hook(forward_pre_hook) + # register the hook + forward_pre_hook_handle = linear.register_forward_pre_hook(forward_pre_hook) - value0 = np.arange(26).reshape(2, 13).astype("float32") - in0 = fluid.dygraph.to_variable(value0) - out0 = linear(in0) + value0 = np.arange(26).reshape(2, 13).astype("float32") + in0 = paddle.to_tensor(value0) + out0 = linear(in0) - # remove the hook - forward_pre_hook_handle.remove() + # remove the hook + forward_pre_hook_handle.remove() - value1 = value0 * 2 - in1 = fluid.dygraph.to_variable(value1) - out1 = linear(in1) + value1 = value0 * 2 + in1 = paddle.to_tensor(value1) + out1 = linear(in1) - # hook change the linear's input to input * 2, so out0 is equal to out1. - assert (out0.numpy() == out1.numpy()).any() + # hook change the linear's input to input * 2, so out0 is equal to out1. 
+ assert (out0.numpy() == out1.numpy()).any() """ hook_remove_helper = HookRemoveHelper(self._forward_pre_hooks) self._forward_pre_hooks[hook_remove_helper._hook_id] = hook @@ -294,17 +350,37 @@ def create_parameter(self, Parameters: shape(list): Shape of the parameter. - attr(ParamAttr, optional): Parameter attribute of weight. Please refer to :ref:`api_fluid_ParamAttr`. Default: None. - dtype(str or core.VarDesc.VarType or str, optional): Data type of this parameter. + attr(ParamAttr, optional): Parameter attribute of weight. Please refer to :ref:`api_paddle_ParamAttr`. Default: None. + dtype(str, optional): Data type of this parameter. If set str, it can be "bool", "float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8" or "uint16". Default: "float32". is_bias(bool, optional): if this is a bias parameter. Default: False. default_initializer(Initializer, optional): the default initializer for this parameter. - If set None, default initializer will be set to :ref:`api_fluid_initializer_XavierInitializer` and :ref:`api_fluid_initializer_ConstantInitializer` + If set None, default initializer will be set to paddle.nn.initializer.Xavier and paddle.nn.initializer.Constant for non-bias and bias parameter, respectively. Default: None. Returns: - :ref:`api_guide_Variable_en` : created parameter. + :Tensor, created parameter. + + Examples: + .. 
code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + w_tmp = self.create_parameter([1,1]) + self.add_parameter("w_tmp", w_tmp) + + def forward(self, input): + return self._linear(input) + + mylayer = MyLayer() + for name, param in mylayer.named_parameters(): + print(name, param) # will print w_tmp,_linear.weight,_linear.bias + """ temp_attr = copy.deepcopy(attr) if isinstance(temp_attr, six.string_types) and temp_attr == "": @@ -313,24 +389,40 @@ def create_parameter(self, default_initializer) # TODO: Add more parameter list when we need them - def create_variable(self, - name=None, - persistable=None, - dtype=None, - type=core.VarDesc.VarType.LOD_TENSOR): + def create_variable(self, name=None, persistable=None, dtype=None): """Create Variable for this layer. Parameters: name(str, optional): name of the variable. Please refer to :ref:`api_guide_Name` . Default: None persistable(bool, optional): if set this variable persistable. Default: False - dtype(str or core.VarDesc.VarType, optional): data type of this parameter. + dtype(str, optional): data type of this parameter. If set str, it can be "bool", "float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8" or "uint16". - If set None, it will be ``core.VarDesc.VarType.FP32``. Default: None - type(core.VarDesc.VarType, optional): type of the variable. No need to set this parameter. Default: ``core.VarDesc.VarType.LOD_TENSOR`` + If set None, it will be "float32". Default: None Returns: - :ref:`api_guide_Variable_en` : created Variable. + Tensor, created Variable. + + Examples: + .. 
code-block:: python + + import paddle + + class MyLinear(paddle.nn.Layer): + def __init__(self, + in_features, + out_features): + super(MyLinear, self).__init__() + self.linear = paddle.nn.Linear( 10, 10) + + self.back_var = self.create_variable(name = "linear_tmp_0", dtype=self._dtype) + + def forward(self, input): + out = self.linear(input) + paddle.assign( out, self.back_var) + + return out + """ if name is not None: var_name = ".".join([self._full_name, name]) @@ -339,7 +431,10 @@ def create_variable(self, [self._full_name, "_generated_var"])) return self._helper.main_program.current_block().create_var( - name=var_name, persistable=persistable, dtype=dtype, type=type) + name=var_name, + persistable=persistable, + dtype=dtype, + type=core.VarDesc.VarType.LOD_TENSOR) def parameters(self, include_sublayers=True): """Returns a list of all Parameters from current layer and its sub-layers. @@ -348,7 +443,16 @@ def parameters(self, include_sublayers=True): include_sublayers(bool, optional): Whether include the parameters of sublayers. If True, also include the parameters from sublayers. Default: True Returns: - list of :ref:`api_guide_Variable_en` : a list of Parameters. + list of Tensor : a list of Parameters. + + Examples: + .. code-block:: python + + import paddle + + linear = paddle.nn.Linear(1,1) + print(linear.parameters()) # print linear_0.w_0 and linear_0.b_0 + """ ret = [ param @@ -366,16 +470,15 @@ def children(self): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - fc1 = fluid.Linear(10, 3) - fc2 = fluid.Linear(3, 10, bias_attr=False) - model = fluid.dygraph.Sequential(fc1, fc2) - - layer_list = list(model.children()) + linear1 = paddle.nn.Linear(10, 3) + linear2 = paddle.nn.Linear(3, 10, bias_attr=False) + model = paddle.nn.Sequential(linear1, linear2) + + layer_list = list(model.children()) - print(layer_list) + print(layer_list) # [, ] """ for _, layer in self.named_children(): @@ -391,14 +494,15 @@ def named_children(self): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - fc1 = fluid.Linear(10, 3) - fc2 = fluid.Linear(3, 10, bias_attr=False) - model = fluid.dygraph.Sequential(fc1, fc2) - for prefix, layer in model.named_children(): - print(prefix, layer) + linear1 = paddle.nn.Linear(10, 3) + linear2 = paddle.nn.Linear(3, 10, bias_attr=False) + model = paddle.nn.Sequential(linear1, linear2) + for prefix, layer in model.named_children(): + print(prefix, layer) + # ('0', ) + # ('1', ) """ memo = set() @@ -415,6 +519,26 @@ def sublayers(self, include_sublayers=True): Returns: list of Layer : a list of sub layers. + + Examples: + .. code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + self._dropout = paddle.nn.Dropout(p=0.5) + + def forward(self, input): + temp = self._linear(input) + temp = self._dropout(temp) + return temp + + mylayer = MyLayer() + print(mylayer.sublayers()) # [, ] + """ ret = [ layer @@ -438,14 +562,13 @@ def named_parameters(self, prefix='', include_sublayers=True): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - fc1 = fluid.Linear(10, 3) - fc2 = fluid.Linear(3, 10, bias_attr=False) - model = fluid.dygraph.Sequential(fc1, fc2) - for name, param in model.named_parameters(): - print(name, param) + fc1 = paddle.nn.Linear(10, 3) + fc2 = paddle.nn.Linear(3, 10, bias_attr=False) + model = paddle.nn.Sequential(fc1, fc2) + for name, param in model.named_parameters(): + print(name, param) """ params_set = set() @@ -483,14 +606,13 @@ def named_sublayers(self, Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - fc1 = fluid.Linear(10, 3) - fc2 = fluid.Linear(3, 10, bias_attr=False) - model = fluid.dygraph.Sequential(fc1, fc2) - for prefix, layer in model.named_sublayers(): - print(prefix, layer) + fc1 = paddle.nn.Linear(10, 3) + fc2 = paddle.nn.Linear(3, 10, bias_attr=False) + model = paddle.nn.Sequential(fc1, fc2) + for prefix, layer in model.named_sublayers(): + print(prefix, layer) """ if layers_set is None: @@ -510,11 +632,11 @@ def named_sublayers(self, layers_set=layers_set): yield p, l - def register_buffer(self, name, variable, persistable=True): + def register_buffer(self, name, tensor, persistable=True): """ - Registers a variable as buffer into the layer. + Registers a tensor as buffer into the layer. - `buffer` is a non-parameteric variable and will not be updated by optimizer, + `buffer` is a non-trainable tensor and will not be updated by optimizer, but is necessary for evaluation and inference. For example, the mean and variance in BatchNorm layers. The registered buffer is persistable by default, and will be saved into `state_dict` alongside parameters. If set persistable=False, it registers @@ -525,7 +647,7 @@ def register_buffer(self, name, variable, persistable=True): Parameters: name (string): name of the buffer. 
The buffer can be accessed from this layer using the given name - variable (Variable): the variable to be registered as buffer. + tensor (Tensor): the tensor to be registered as buffer. persistable (bool): whether the buffer is part of this layer's state_dict. @@ -536,16 +658,15 @@ def register_buffer(self, name, variable, persistable=True): .. code-block:: python import numpy as np - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - linear = fluid.Linear(10, 3) - value = np.array([0]).astype("float32") - buffer = fluid.dygraph.to_variable(value) - linear.register_buffer("buf_name", buffer, persistable=True) - - # get the buffer by attribute. - print(linear.buf_name) + linear = paddle.nn.Linear(10, 3) + value = np.array([0]).astype("float32") + buffer = paddle.to_tensor(value) + linear.register_buffer("buf_name", buffer, persistable=True) + + # get the buffer by attribute. + print(linear.buf_name) """ @@ -565,12 +686,12 @@ def register_buffer(self, name, variable, persistable=True): raise KeyError("The name of buffer can not be empty.") elif hasattr(self, name) and name not in self._buffers: raise KeyError("attribute '{}' already exists.".format(name)) - elif variable is not None and not type(variable) == core.VarBase: + elif tensor is not None and not type(tensor) == core.VarBase: raise TypeError( "The registered buffer should be a core.VarBase, but received {}.". - format(type(variable).__name__)) + format(type(tensor).__name__)) else: - self._buffers[name] = variable + self._buffers[name] = tensor if persistable: self._non_persistable_buffer_names_set.discard(name) else: @@ -584,7 +705,21 @@ def buffers(self, include_sublayers=True): include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True Returns: - list of :ref:`api_guide_Variable_en` : a list of buffers. + list of Tensor : a list of buffers. + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + + linear = paddle.nn.Linear(10, 3) + value = np.array([0]).astype("float32") + buffer = paddle.to_tensor(value) + linear.register_buffer("buf_name", buffer, persistable=True) + + print(linear.buffers()) # == print([linear.buf_name]) + """ ret = [ buffer @@ -595,7 +730,7 @@ def buffers(self, include_sublayers=True): def named_buffers(self, prefix='', include_sublayers=True): """ - Returns an iterator over all buffers in the Layer, yielding tuple of name and Variable. + Returns an iterator over all buffers in the Layer, yielding tuple of name and Tensor. Parameters: prefix(str, optional): Prefix to prepend to all buffer names. Default: ''. @@ -603,31 +738,30 @@ def named_buffers(self, prefix='', include_sublayers=True): If True, also include the named buffers from sublayers. Default: True. Yields: - (string, Variable): Tuple of name and Variable + (string, Tensor): Tuple of name and tensor Examples: .. code-block:: python import numpy as np - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - fc1 = fluid.Linear(10, 3) - buffer1 = fluid.dygraph.to_variable(np.array([0]).astype("float32")) - # register a variable as buffer by specific `persistable` - fc1.register_buffer("buf_name_1", buffer1, persistable=True) + fc1 = paddle.nn.Linear(10, 3) + buffer1 = paddle.to_tensor(np.array([0]).astype("float32")) + # register a tensor as buffer by specific `persistable` + fc1.register_buffer("buf_name_1", buffer1, persistable=True) - fc2 = fluid.Linear(3, 10) - buffer2 = fluid.dygraph.to_variable(np.array([1]).astype("float32")) - # register a buffer by assigning an attribute with Variable. - # The `persistable` can only be False by this way. - fc2.buf_name_2 = buffer2 + fc2 = paddle.nn.Linear(3, 10) + buffer2 = paddle.to_tensor(np.array([1]).astype("float32")) + # register a buffer by assigning an attribute with Tensor. + # The `persistable` can only be False by this way. 
+ fc2.buf_name_2 = buffer2 - model = fluid.dygraph.Sequential(fc1, fc2) + model = paddle.nn.Sequential(fc1, fc2) - # get all named buffers - for name, buffer in model.named_buffers(): - print(name, buffer) + # get all named buffers + for name, buffer in model.named_buffers(): + print(name, buffer) """ buffers_set = set() @@ -654,19 +788,18 @@ def clear_gradients(self): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np - with fluid.dygraph.guard(): - value = np.arange(26).reshape(2, 13).astype("float32") - a = fluid.dygraph.to_variable(value) - linear = fluid.Linear(13, 5, dtype="float32") - adam = fluid.optimizer.Adam(learning_rate=0.01, - parameter_list=linear.parameters()) - out = linear(a) - out.backward() - adam.minimize(out) - linear.clear_gradients() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, + parameters=linear.parameters()) + out = linear(a) + out.backward() + adam.step() + linear.clear_gradients() """ for p in self.parameters(): @@ -726,6 +859,32 @@ def add_sublayer(self, name, sublayer): sublayer(Layer): an instance of Layer. Returns: Layer: the sublayer passed in. + + Examples: + .. 
code-block:: python + + import paddle + + class MySequential(paddle.nn.Layer): + def __init__(self, *layers): + super(MySequential, self).__init__() + if len(layers) > 0 and isinstance(layers[0], tuple): + for name, layer in layers: + self.add_sublayer(name, layer) + else: + for idx, layer in enumerate(layers): + self.add_sublayer(str(idx), layer) + + def forward(self, input): + for layer in self._sub_layers.values(): + input = layer(input) + return input + + fc1 = paddle.nn.Linear(10, 3) + fc2 = paddle.nn.Linear(3, 10, bias_attr=False) + model = MySequential(fc1, fc2) + for prefix, layer in model.named_sublayers(): + print(prefix, layer) """ assert isinstance(sublayer, core.Layer) @@ -742,6 +901,25 @@ def add_parameter(self, name, parameter): parameter(Parameter): an instance of Parameter. Returns: Parameter: the parameter passed in. + Examples: + .. code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + w_tmp = self.create_parameter([1,1]) + self.add_parameter("w_tmp", w_tmp) + + def forward(self, input): + return self._linear(input) + + mylayer = MyLayer() + for name, param in mylayer.named_parameters(): + print(name, param) # will print w_tmp,_linear.weight,_linear.bias + """ if '_parameters' not in self.__dict__: raise RuntimeError( @@ -871,24 +1049,23 @@ def __dir__(self): Return a list. Get all parameters, buffers(non-parameter variables), sublayers, method and attr of Layer. Examples: - import paddle.fluid as fluid - import numpy as np - - fluid.dygraph.enable_dygraph() + .. 
code-block:: python + import paddle + import numpy as np - class Mylayer(fluid.dygraph.Layer): - def __init__(self): - super(Mylayer, self).__init__() - self.linear1 = fluid.dygraph.Linear(10, 10) - self.linear2 = fluid.dygraph.Linear(5, 5) - self.conv2d = fluid.dygraph.Conv2D(3, 2, 3) - self.embedding = fluid.dygraph.Embedding(size=[128, 16]) - self.h_0 = fluid.dygraph.to_variable(np.zeros([10, 10]).astype('float32')) + class Mylayer(paddle.nn.Layer): + def __init__(self): + super(Mylayer, self).__init__() + self.linear1 = paddle.nn.Linear(10, 10) + self.linear2 = paddle.nn.Linear(5, 5) + self.conv2d = paddle.nn.Conv2d(3, 2, 3) + self.embedding = paddle.nn.Embedding(128, 16) + self.h_0 = paddle.to_tensor(np.zeros([10, 10]).astype('float32')) - mylayer = Mylayer() - print(dir(mylayer)) - # only parts are shown, because of list have too much content - # ['__call__', '__class__', ... , 'conv2d', 'embedding', 'h_0', 'linear1', 'linear2', ... , 'sublayers', 'train'] + mylayer = Mylayer() + print(dir(mylayer)) + # only parts are shown, because of list have too much content + # ['__call__', '__class__', ... , 'conv2d', 'embedding', 'h_0', 'linear1', 'linear2', ... , 'sublayers', 'train'] """ method = dir(self.__class__) @@ -918,12 +1095,12 @@ def state_dict(self, Examples: .. code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + import paddle - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") + emb = paddle.nn.Embedding(10, 10) + + state_dict = emb.state_dict() + paddle.save( state_dict, "paddle_dy.pdparams") ''' @@ -967,16 +1144,12 @@ def set_state_dict(self, .. 
code-block:: python import paddle - - paddle.disable_static() - + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.save(state_dict, "paddle_dy.pdparams") - para_state_dict = paddle.load("paddle_dy.pdparams") - emb.set_state_dict(para_state_dict) ''' diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 05269028acc40..1a488844dec21 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -895,9 +895,6 @@ def forward(self, input): class Linear(layers.Layer): """ - :alias_main: paddle.nn.Linear - :alias: paddle.nn.Linear,paddle.nn.layer.Linear,paddle.nn.layer.common.Linear - :old_api: paddle.fluid.dygraph.Linear Fully-connected linear transformation layer: diff --git a/python/paddle/fluid/dygraph/static_runner.py b/python/paddle/fluid/dygraph/static_runner.py index d482077cd4f2a..e8738da07e993 100644 --- a/python/paddle/fluid/dygraph/static_runner.py +++ b/python/paddle/fluid/dygraph/static_runner.py @@ -14,7 +14,7 @@ from __future__ import print_function -from paddle.fluid.dygraph.jit import SaveLoadConfig +from paddle.fluid.dygraph.jit import _SaveLoadConfig from paddle.fluid.dygraph.io import TranslatedLayer @@ -31,7 +31,7 @@ class StaticModelRunner(object): """ def __new__(cls, model_dir, model_filename=None, params_filename=None): - configs = SaveLoadConfig() + configs = _SaveLoadConfig() if model_filename is not None: configs.model_filename = model_filename if params_filename is not None: diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 3dc30767e5aa4..f5660c3fc91a1 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -54,11 +54,11 @@ def global_scope(): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle import numpy - fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace()) - numpy.array(fluid.global_scope().find_var("data").get_tensor()) + paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace()) + numpy.array(paddle.static.global_scope().find_var("data").get_tensor()) """ return g_scope @@ -94,12 +94,13 @@ def scope_guard(scope): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy + paddle.enable_static() - new_scope = fluid.Scope() - with fluid.scope_guard(new_scope): - fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace()) + new_scope = paddle.static.Scope() + with paddle.static.scope_guard(new_scope): + paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace()) numpy.array(new_scope.find_var("data").get_tensor()) """ diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 61ffb60b1105d..52c1e5d5e16c1 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -380,31 +380,35 @@ def cuda_places(device_ids=None): For multi-card tasks, please use `FLAGS_selected_gpus` environment variable to set the visible GPU device. The next version will fix the problem with `CUDA_VISIBLE_DEVICES` environment variable. - This function creates a list of :code:`fluid.CUDAPlace` objects. + This function creates a list of :code:`paddle.CUDAPlace` objects. If :code:`device_ids` is None, environment variable of :code:`FLAGS_selected_gpus` would be checked first. For example, if :code:`FLAGS_selected_gpus=0,1,2`, the returned list would - be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)]. + be [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)]. 
If :code:`FLAGS_selected_gpus` is not set, all visible gpu places would be returned according to the :code:`CUDA_VISIBLE_DEVICES` environment variable. If :code:`device_ids` is not None, it should be the device ids of GPUs. For example, if :code:`device_ids=[0,1,2]`, the returned list would be - [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)]. + [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)]. Parameters: device_ids (list or tuple of int, optional): list of GPU device ids. Returns: - list of fluid.CUDAPlace: Created GPU place list. + list of paddle.CUDAPlace: Created GPU place list. Examples: .. code-block:: python - import paddle.fluid as fluid - cuda_places = fluid.cuda_places() + import paddle + import paddle.static as static + + paddle.enable_static() + + cuda_places = static.cuda_places() """ assert core.is_compiled_with_cuda(), \ @@ -418,7 +422,7 @@ def cuda_places(device_ids=None): def cpu_places(device_count=None): """ - This function creates a list of :code:`fluid.CPUPlace` objects, and returns the created list. + This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. If :code:`device_count` is None, the device count would be determined by environment variable :code:`CPU_NUM`. @@ -431,13 +435,17 @@ def cpu_places(device_count=None): device_count (int, optional): device number. Default: None. Returns: - list of fluid.CPUPlace: Created list of CPU places. + list of paddle.CPUPlace: Created list of CPU places. Examples: .. code-block:: python - import paddle.fluid as fluid - cpu_places = fluid.cpu_places() + import paddle + import paddle.static as static + + paddle.enable_static() + + cpu_places = static.cpu_places() """ if device_count is None: @@ -5115,6 +5123,8 @@ class Parameter(Variable): be applied on the parameter. Default: None do_model_average(bool): True if the model average strategy will be applied on this parameter. 
+ need_clip (bool): Whether the parameter gradient needs to be clipped + in optimizer. Default is True. """ def __init__(self, @@ -5154,6 +5164,8 @@ def __init__(self, self.do_model_average = kwargs.get('do_model_average', None) + self.need_clip = kwargs.get('need_clip', True) + self.is_distributed = False def __str__(self): @@ -5186,7 +5198,7 @@ def to_string(self, throw_on_error, with_details=False): if with_details: res_str = Variable.to_string(self, throw_on_error, True) additional_attr = ("trainable", "optimize_attr", "regularizer", - "do_model_average") + "do_model_average", "need_clip") for attr_name in additional_attr: res_str += "%s: %s\n" % (attr_name, cpt.to_text(getattr(self, attr_name))) @@ -5218,6 +5230,8 @@ class ParamBase(core.VarBase): be applied on the ParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this ParamBase. + need_clip (bool): Whether the parameter gradient needs to be clipped + in optimizer. Default is True. """ @dygraph_only @@ -5257,6 +5271,8 @@ def __init__(self, shape, dtype, **kwargs): self.do_model_average = kwargs.get('do_model_average', None) + self.need_clip = kwargs.get('need_clip', True) + self.is_distributed = False # self.block = default_main_program().global_block() diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index e348c67ae0461..90847382c86e1 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -133,6 +133,8 @@ def __init__(self, main_program, startup_program, strategy, role_maker): self.origin_main_program = main_program self.origin_startup_program = startup_program + self.origin_ps_main_program = main_program + self.origin_ps_startup_program = startup_program self.strategy = strategy self.role_maker = role_maker @@ -153,6 +155,11 @@ def __init__(self, main_program, 
startup_program, strategy, role_maker): self._build_var_distributed() + # for heter-ps save variables + self.origin_merged_variables_pairs = list(self.merged_variables_pairs) + self.origin_merged_dense_pairs = list(self.merged_dense_pairs) + self.origin_merged_sparse_pairs = list(self.merged_sparse_pairs) + def get_distributed_mode(self): trainer = self.strategy.get_trainer_runtime_config() return trainer.mode @@ -214,6 +221,18 @@ def get_origin_main_program(self): def get_origin_startup_program(self): return self.origin_startup_program + def set_origin_ps_main_program(self, program): + self.origin_ps_main_program = program + + def set_origin_ps_startup_program(self, program): + self.origin_ps_startup_program = program + + def get_origin_ps_main_program(self): + return self.origin_ps_main_program + + def get_origin_ps_startup_program(self): + return self.origin_ps_startup_program + def get_sparse_varname_on_ps(self, is_distributed, endpoint=None): if not endpoint: endpoint = self.get_ps_endpoint() @@ -378,7 +397,9 @@ def get_communicator_send_context(self): send_ctx[name] = ctx return send_ctx - def get_communicator_recv_context(self, recv_type=1): + def get_communicator_recv_context(self, + recv_type=1, + use_origin_program=False): # recv_type # 1 : DENSE 2. SPARSE 3. DISTRIBUTED 4. 
ALL distibuted_varnames = get_sparse_tablenames(self.origin_main_program, @@ -392,7 +413,8 @@ def get_communicator_recv_context(self, recv_type=1): sparse_recv_ctx = {} distributed_recv_ctx = {} - for merged in self.merged_variables_pairs: + variables_pairs = self.merged_variables_pairs if not use_origin_program else self.origin_merged_variables_pairs + for merged in variables_pairs: params = merged[0] if params.merged_var.name in sparse_varnames: continue diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 7a92adf0a89dc..67c572d4988ce 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -729,31 +729,32 @@ class BilinearInitializer(Initializer): .. code-block:: python - import paddle.fluid as fluid import math + + import paddle + import paddle.nn as nn + from paddle.regularizer import L2Decay + factor = 2 C = 2 B = 8 H = W = 32 - w_attr = fluid.param_attr.ParamAttr( - learning_rate=0., - regularizer=fluid.regularizer.L2Decay(0.), - initializer=fluid.initializer.Bilinear()) - x = fluid.data(name="data", shape=[B, 3, H, W], - dtype="float32") - conv_up = fluid.layers.conv2d_transpose( - input=x, - num_filters=C, - output_size=None, - filter_size=2 * factor - factor % 2, - padding=int(math.ceil((factor - 1) / 2.)), - stride=factor, - groups=C, - param_attr=w_attr, - bias_attr=False) - - Where, `num_filters=C` and `groups=C` means this is channel-wise transposed - convolution. 
The filter shape will be (C, 1, K, K) where K is `filer_size`, + w_attr = paddle.ParamAttr(learning_rate=0., + regularizer=L2Decay(0.), + initializer=nn.initializer.Bilinear()) + data = paddle.rand([B, 3, H, W], dtype='float32') + conv_up = nn.ConvTranspose2d(3, + out_channels=C, + kernel_size=2 * factor - factor % 2, + padding=int( + math.ceil((factor - 1) / 2.)), + stride=factor, + weight_attr=w_attr, + bias_attr=False) + x = conv_up(data) + + Where, `out_channels=C` and `groups=C` means this is channel-wise transposed + convolution. The filter shape will be (C, 1, K, K) where K is `kernel_size`, This initializer will set a (K, K) interpolation kernel for every channel of the filter identically. The resulting shape of the output feature map will be (B, C, factor * H, factor * W). Note that the learning rate and the diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index 529588c0846b5..0e3ee46fa46d1 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -220,24 +220,96 @@ def embedding(input, Returns: Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` . - Examples: + Static Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.enable_static() + + x = paddle.static.data(name="x", shape = [2, 4], dtype=np.int64) + embedding = paddle.nn.Embedding(10, 3, + weight_attr=paddle.nn.initializer.Constant(value=1.0)) + adam = paddle.optimizer.SGD(parameters=[embedding.weight], learning_rate=0.01) + output = embedding(x) + m_output=paddle.mean(output) + + adam.minimize(m_output) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + + x = np.array([[7, 2, 4, 5],[4, 3, 2, 9]], dtype=np.int64) + + # x is a Numpy. + # x.data = [[7, 2, 4, 5], [4, 3, 2, 9]] + # x.shape = [2, 4] + + out, = exe.run(paddle.static.default_main_program(), feed={'x':x}, fetch_list=[output]) + + # out is a Numpy. 
+ # out.data = [[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]], + # + # [[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.], + # [0., 0., 0.]]] + # out.shape = [2, 4, 3] + + + Dygraph Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np - data = fluid.data(name='x', shape=[None, 10], dtype='int64') - - # example 1 - emb_1 = fluid.embedding(input=data, size=[128, 64]) - - # example 2: load custom or pre-trained word vectors - weight_data = np.random.random(size=(128, 100)) # word vectors with numpy format - w_param_attrs = fluid.ParamAttr( - name="emb_weight", - learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer(weight_data), - trainable=True) - emb_2 = fluid.embedding(input=data, size=(128, 100), param_attr=w_param_attrs, dtype='float32') + import paddle + import numpy as np + + paddle.disable_static() + + x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) + + # x is a Tensor. + # x.data = [[3], [4], [5]] + # x.shape = [3, 1] + x = paddle.to_tensor(x_data, stop_gradient=False) + + # embedding weight shape = [10, 3] + embedding = paddle.nn.Embedding(10, 3, sparse=True) + + # embedding weight data = [10, 3] + w0 = np.full(shape=(10, 3), fill_value=2).astype(np.float32) + + # embedding.weight.shape = [10, 3] + # embedding.weight.data = + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]] + embedding.weight.set_value(w0) + + adam = paddle.optimizer.Adam( + parameters=[embedding.weight], learning_rate=0.01) + adam.clear_grad() + + # out is Tensor + # out.shape: [3, 1, 3] + # out.layout: NCHW + # out.dtype: float + # out.data: [2 2 2 2 2 2 2 2 2] + out = embedding(x) + + out.backward() + adam.step() + """ helper = LayerHelper('embedding', **locals()) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index fe5b683bdeaa3..bb55aeb70d1f2 100644 --- 
a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1346,7 +1346,7 @@ def save_inference_model(dirname, append_fetch_ops(main_program, fetch_var_names) main_program.desc._set_version() - paddle.fluid.core.save_op_compatible_info(main_program.desc) + paddle.fluid.core.save_op_version_info(main_program.desc) with open(model_basename, "wb") as f: f.write(main_program.desc.serialize_to_string()) else: @@ -1720,7 +1720,7 @@ def get_tensor(var): main_program = program.clone() program.desc.flush() main_program.desc._set_version() - paddle.fluid.core.save_op_compatible_info(program.desc) + paddle.fluid.core.save_op_version_info(program.desc) with open(model_path + ".pdmodel", "wb") as f: f.write(program.desc.serialize_to_string()) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 411ac6e51b1c8..0c77917c78190 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -2297,11 +2297,6 @@ def copy_var_to_parent_block(var, layer_helper): def cond(pred, true_fn=None, false_fn=None, name=None): """ - :api_attr: Static Graph - :alias_main: paddle.nn.cond - :alias: paddle.nn.cond,paddle.nn.control_flow.cond - :old_api: paddle.fluid.layers.cond - This API returns ``true_fn()`` if the predicate ``pred`` is true else ``false_fn()`` . Users could also set ``true_fn`` or ``false_fn`` to ``None`` if do nothing and this API will treat the callable simply returns @@ -2323,17 +2318,18 @@ def cond(pred, true_fn=None, false_fn=None, name=None): semantics. For example: .. 
code-block:: python - - import paddle.fluid as fluid - a = fluid.data(name='a', shape=[-1, 1], dtype='float32') - b = fluid.data(name='b', shape=[-1, 1], dtype='float32') + + import paddle + + a = paddle.zeros((1, 1)) + b = paddle.zeros((1, 1)) c = a * b - out = fluid.layers.cond(a < b, lambda: a + c, lambda: b * b) + out = paddle.nn.cond(a < b, lambda: a + c, lambda: b * b) No matter whether ``a < b`` , ``c = a * b`` will run. Args: - pred(Variable): A boolean tensor whose numel should be 1. The boolean + pred(Tensor): A boolean tensor whose numel should be 1. The boolean value determines whether to return the result of ``true_fn`` or ``false_fn`` . true_fn(callable, optional): A callable to be performed if ``pred`` is @@ -2345,7 +2341,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None): refer to :ref:`api_guide_Name` . Returns: - Variable|list(Variable)|tuple(Variable): returns ``true_fn()`` if the + Tensor|list(Tensor)|tuple(Tensor): returns ``true_fn()`` if the predicate ``pred`` is true else ``false_fn()`` . Raises: @@ -2356,10 +2352,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None): Examples: .. 
code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - from paddle.fluid.executor import Executor - from paddle.fluid.framework import Program, program_guard + import paddle # # pseudocode: @@ -2369,32 +2362,28 @@ def cond(pred, true_fn=None, false_fn=None, name=None): # return 3, 2 # + def true_func(): - return layers.fill_constant( - shape=[1, 2], dtype='int32', value=1), layers.fill_constant( - shape=[2, 3], dtype='bool', value=True) + return paddle.fill_constant(shape=[1, 2], dtype='int32', + value=1), paddle.fill_constant(shape=[2, 3], + dtype='bool', + value=True) + def false_func(): - return layers.fill_constant( - shape=[3, 4], dtype='float32', value=3), layers.fill_constant( - shape=[4, 5], dtype='int64', value=2) - - main_program = Program() - startup_program = Program() - with program_guard(main_program, startup_program): - x = layers.fill_constant(shape=[1], dtype='float32', value=0.1) - y = layers.fill_constant(shape=[1], dtype='float32', value=0.23) - pred = layers.less_than(x, y) - out = layers.cond(pred, true_func, false_func) - # out is a tuple containing 2 tensors - - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - exe = fluid.Executor(place) - ret = exe.run(main_program, fetch_list=out) + return paddle.fill_constant(shape=[3, 4], dtype='float32', + value=3), paddle.fill_constant(shape=[4, 5], + dtype='int64', + value=2) + + x = paddle.fill_constant(shape=[1], dtype='float32', value=0.1) + y = paddle.fill_constant(shape=[1], dtype='float32', value=0.23) + pred = paddle.less_than(x=x, y=y, name=None) + ret = paddle.nn.cond(pred, true_func, false_func) + # ret is a tuple containing 2 tensors # ret[0] = [[1 1]] # ret[1] = [[ True True True] - # [ True True True]] + # [ True True True]] """ if in_dygraph_mode(): diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 3610efdd505bd..2b1449a94e6e5 100644 --- 
a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -1681,11 +1681,6 @@ def kldiv_loss(x, target, reduction='mean', name=None): def npair_loss(anchor, positive, labels, l2_reg=0.002): ''' - :alias_main: paddle.nn.functional.npair_loss - :alias: paddle.nn.functional.npair_loss,paddle.nn.functional.loss.npair_loss - :old_api: paddle.fluid.layers.npair_loss - - **Npair Loss Layer** Read `Improved Deep Metric Learning with Multi class N pair Loss Objective\