Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into zyf_slice

zyfncg committed Aug 31, 2021
2 parents 6666c94 + 561841d commit 29c3e5b
Showing 87 changed files with 6,460 additions and 1,108 deletions.
6 changes: 3 additions & 3 deletions paddle/fluid/framework/details/build_strategy.h
@@ -113,15 +113,15 @@ struct BuildStrategy {
// Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
// should not be sparse types
paddle::optional<bool> fuse_all_optimizer_ops_{false};
paddle::optional<bool> fuse_all_reduce_ops_{boost::none};
paddle::optional<bool> fuse_all_reduce_ops_{paddle::none};
// fuse_relu_depthwise_conv can fuse the `relu ->
// depthwise_conv`
bool fuse_relu_depthwise_conv_{false};
// NOTE(zcd): In reduce mode, fusing broadcast ops may make the program
// faster. Because fusing broadcast OP equals delaying the execution of all
// broadcast Ops, in this case, all nccl streams are used only for reduce
// operations for a period of time.
paddle::optional<bool> fuse_broadcast_ops_{boost::none};
paddle::optional<bool> fuse_broadcast_ops_{paddle::none};
// replace batch_norm with sync_batch_norm.
bool sync_batch_norm_{false};

@@ -135,7 +135,7 @@ struct BuildStrategy {
// By default, memory_optimize would be opened if gc is disabled, and
// be closed if gc is enabled.
// Users can forcely enable/disable memory_optimize by setting True/False.
paddle::optional<bool> memory_optimize_{boost::none};
paddle::optional<bool> memory_optimize_{paddle::none};

// Turn on inplace by default.
bool enable_inplace_{true};
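The three hunks above only swap `boost::none` for `paddle::none` as the empty state of `paddle::optional<bool>`. Below is a minimal sketch of the tri-state flag pattern these members rely on, assuming `paddle::optional` mirrors the familiar `boost::optional` interface and that `paddle/utils/optional.h` is the header providing it:

```cpp
#include <iostream>
#include "paddle/utils/optional.h"  // assumed location of paddle::optional

int main() {
  // Unset by default: neither true nor false, so callers can apply a policy.
  paddle::optional<bool> fuse_all_reduce_ops{paddle::none};

  if (!fuse_all_reduce_ops) {
    std::cout << "flag not set; pick a default from the other strategy bits\n";
  }

  fuse_all_reduce_ops = true;  // a user explicitly enables the fusion
  if (fuse_all_reduce_ops && *fuse_all_reduce_ops) {
    std::cout << "fusion explicitly enabled\n";
  }
  return 0;
}
```

The tri-state form matters because, as the comments in the hunk note, memory_optimize defaults on only when GC is disabled; an unset optional lets the framework distinguish "user said nothing" from "user said false".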
6 changes: 3 additions & 3 deletions paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2249,9 +2249,9 @@ PDNode *patterns::MultipleQuantize::operator()() {
PDNode *patterns::QuantizePlacement::operator()(
const std::unordered_set<std::string> &quantize_enabled_op_types) {
std::unordered_set<std::string> supported_op_types =
std::unordered_set<std::string>({"concat", "conv2d", "elementwise_add",
"fc", "matmul", "pool2d", "prior_box",
"reshape2", "transpose2", "fusion_gru"});
std::unordered_set<std::string>(
{"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d",
"prior_box", "reshape2", "transpose2", "fusion_gru", "multi_gru"});
if (!quantize_enabled_op_types.empty()) {
supported_op_types = quantize_enabled_op_types;
}
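The hunk adds "multi_gru" to the default whitelist and reflows the initializer. The override rule itself, where a non-empty user-supplied set replaces the defaults wholesale rather than intersecting with them, can be sketched standalone (illustrative only; `ResolveQuantizableOps` is not a function in the codebase):

```cpp
#include <iostream>
#include <string>
#include <unordered_set>

// Illustrative helper mirroring QuantizePlacement's override rule above;
// not part of the actual pass.
std::unordered_set<std::string> ResolveQuantizableOps(
    const std::unordered_set<std::string>& quantize_enabled_op_types) {
  std::unordered_set<std::string> supported_op_types = {
      "concat",     "conv2d",     "elementwise_add", "fc",
      "matmul",     "pool2d",     "prior_box",       "reshape2",
      "transpose2", "fusion_gru", "multi_gru"};
  // A non-empty user list replaces the defaults entirely.
  if (!quantize_enabled_op_types.empty()) {
    supported_op_types = quantize_enabled_op_types;
  }
  return supported_op_types;
}

int main() {
  for (const auto& op : ResolveQuantizableOps({"conv2d", "pool2d"})) {
    std::cout << op << "\n";  // prints only conv2d and pool2d
  }
  return 0;
}
```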
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -832,7 +832,7 @@ void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const {
GET_IR_NODE_FROM_SUBGRAPH(weight_x, weight_x, pattern);
GET_IR_NODE_FROM_SUBGRAPH(out, out, pattern);

if (!AreScalesPresentForNodes({x, weight_h, weight_x})) {
if (!AreScalesPresentForNodes({x, weight_x})) {
LogCannotQuantizeOp(op);
return;
}
33 changes: 26 additions & 7 deletions paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -758,7 +758,9 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph,
Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out,
Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w,
Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b,
Node* reshape2, Node* reshape2_qkv_out, Node* scale, Node* scale_out) {
Node* reshape2, Node* reshape2_qkv_out, Node* scale, Node* scale_out,
Node* softmax_qk, Node* eltadd0, Node* eltadd1, Node* eltadd2,
Node* matmul_qk) {
auto scale_attr = BOOST_GET_CONST(float, scale->Op()->GetAttr("scale"));

// mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H)
@@ -876,19 +878,35 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph,
weight_max = std::max(weight_max, weight_scale2);
multihead_op_desc.SetAttr("weight_scale", weight_max);

if (mul0_op_desc->HasAttr("out_threshold")) {
auto* add0_op_desc = eltadd0->Op();
auto* add1_op_desc = eltadd1->Op();
auto* add2_op_desc = eltadd2->Op();
if (add0_op_desc->HasAttr("out_threshold")) {
auto out_scale0 =
BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold"));
BOOST_GET_CONST(float, add0_op_desc->GetAttr("out_threshold"));
auto out_scale1 =
BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold"));
BOOST_GET_CONST(float, add1_op_desc->GetAttr("out_threshold"));
auto out_scale2 =
BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold"));
BOOST_GET_CONST(float, add2_op_desc->GetAttr("out_threshold"));
auto out_scale_max = std::max(out_scale0, out_scale1);
out_scale_max = std::max(out_scale_max, out_scale2);
multihead_op_desc.SetAttr("out_threshold", out_scale_max);
multihead_op_desc.SetAttr("fc_out_threshold", out_scale_max);
}
}

auto* softmax_qk_op_desc = softmax_qk->Op();
auto* matmul_qk_op_desc = matmul_qk->Op();
if (matmul_qk_op_desc->HasAttr("X_scale")) {
multihead_op_desc.SetAttr("qkv2context_plugin_int8", true);
if (softmax_qk_op_desc->HasAttr("out_threshold")) {
auto qkv_plugin_scale = BOOST_GET_CONST(
float, softmax_qk_op_desc->GetAttr("out_threshold"));
multihead_op_desc.SetAttr("dp_probs", qkv_plugin_scale);
}
} else {
multihead_op_desc.SetAttr("qkv2context_plugin_int8", false);
}

auto* multihead = graph->CreateOpNode(&multihead_op_desc);

IR_NODE_LINK_TO(input0, multihead);
@@ -990,7 +1008,8 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph,
}
fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w,
mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b,
reshape2_0, reshape2_qkv_out, scale, scale_out);
reshape2_0, reshape2_qkv_out, scale, scale_out, softmax_qk,
eltadd0, eltadd1, eltadd2, matmul_qk);

std::unordered_set<const Node*> marked_nodes({eltadd0,
eltadd1,
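The new nodes plumbed through `fuse_creater` drive an int8-plugin decision: if the QK matmul carries an `X_scale` attribute, the fused op is marked `qkv2context_plugin_int8` and the softmax's `out_threshold` is forwarded as `dp_probs`. A stand-alone sketch of that branch, using a plain map in place of the real `OpDesc` attribute API (only the attribute keys come from the diff; the helper and types are hypothetical):

```cpp
#include <iostream>
#include <map>
#include <string>

// Hypothetical attribute bag standing in for OpDesc.
using AttrMap = std::map<std::string, float>;

void DecideInt8Plugin(const AttrMap& matmul_qk_attrs,
                      const AttrMap& softmax_qk_attrs, AttrMap* fused_attrs) {
  if (matmul_qk_attrs.count("X_scale")) {
    (*fused_attrs)["qkv2context_plugin_int8"] = 1.0f;
    if (softmax_qk_attrs.count("out_threshold")) {
      // Forward the softmax output scale to the plugin as dp_probs.
      (*fused_attrs)["dp_probs"] = softmax_qk_attrs.at("out_threshold");
    }
  } else {
    (*fused_attrs)["qkv2context_plugin_int8"] = 0.0f;
  }
}

int main() {
  AttrMap fused;
  DecideInt8Plugin({{"X_scale", 0.5f}}, {{"out_threshold", 0.25f}}, &fused);
  std::cout << fused["dp_probs"] << "\n";  // 0.25
  return 0;
}
```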
2 changes: 1 addition & 1 deletion paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -2,7 +2,7 @@ cc_library(workqueue SRCS workqueue.cc)
cc_library(interpretercore SRCS interpretercore.cc DEPS op_registry
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor workqueue device_event device_event_gpu)
graph_to_program_pass variable_helper timer monitor workqueue ${DEVICE_EVENT_LIBS})
cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue)
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
76 changes: 28 additions & 48 deletions paddle/fluid/framework/new_executor/interpretercore.cc
@@ -12,16 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/new_executor/interpretercore.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/new_executor/interpretercore_gc_helper.h"

#if defined(PADDLE_WITH_CUDA)
using ::paddle::platform::kCUDA;
USE_EVENT(kCUDA);
#endif

#include <unordered_set>

#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/new_executor/interpretercore_gc_helper.h"

namespace paddle {
namespace framework {

@@ -74,27 +70,26 @@ std::vector<size_t> ParseEventVarIds(const Instruction& cur_instr,
}

void AssociateInputWithEvents(
const std::vector<size_t>& new_event_var_id, Instruction* next_instr,
std::map<size_t, std::shared_ptr<platform::CudaEvent>>* var_id2event,
const platform::Place& place, const std::vector<size_t>& new_event_var_id,
Instruction* next_instr,
std::map<size_t, std::shared_ptr<platform::DeviceEvent>>* var_id2event,
bool is_sync) {
#ifdef PADDLE_WITH_CUDA
for (auto var_id : new_event_var_id) {
if (var_id2event->count(var_id) == 0) {
auto cuda_event = std::make_shared<platform::CudaEvent>(
platform::get_cuda_flags(false, false, false));
var_id2event->emplace(var_id, std::move(cuda_event));
auto device_event = std::make_shared<platform::DeviceEvent>(
place, platform::GenerateDeviceEventFlag());
var_id2event->emplace(var_id, std::move(device_event));
}
// Add events for next_instr.inputs
next_instr->intput_events_.emplace_back(var_id, var_id2event->at(var_id),
is_sync);
}
#endif
}

void ParseDirectAndEventRunOps(
const std::vector<OpFuncNode>& op_func_nodes,
const platform::Place& place, const std::vector<OpFuncNode>& op_func_nodes,
const std::vector<size_t>& downstream_ops, size_t op_index,
std::map<size_t, std::shared_ptr<platform::CudaEvent>>* var_id2event,
std::map<size_t, std::shared_ptr<platform::DeviceEvent>>* var_id2event,
std::vector<Instruction>* instructions) {
auto& op_func_type = op_func_nodes[op_index].type_;
auto& cur_instr = instructions->at(op_index);
@@ -119,24 +114,22 @@ void ParseDirectAndEventRunOps(

bool is_sync =
(op_func_nodes[next_op_id].type_ == OpFuncType::kQueueSync);
AssociateInputWithEvents(new_event_var_ids, &next_instr, var_id2event,
is_sync);
AssociateInputWithEvents(place, new_event_var_ids, &next_instr,
var_id2event, is_sync);

if (is_sync) { // GPU -> CPU
next_instruction.synchronize_run_.emplace_back(next_op_id);
} else { // GPU -> GPU(different stream)
next_instruction.event_wait_run_.emplace_back(next_op_id);
}
}
#ifdef PADDLE_WITH_CUDA
// Create events for these cross-stream vars
VLOG(3) << cur_instr.kernel_func_.operator_base_->Type()
<< " event_var_ids.size: " << event_var_ids.size();
for (auto var_id : event_var_ids) {
cur_instr.output_events_.emplace_back(var_id, var_id2event->at(var_id),
false /*not used*/);
}
#endif
}
}
} // namespace
@@ -263,12 +256,10 @@ void InterpreterCore::Convert() {
}

for (size_t i = 0; i < vec_instruction_.size(); ++i) {
#if defined(PADDLE_WITH_CUDA)
int device_type = static_cast<int>(paddle::platform::DeviceType::CUDA);
paddle::platform::DeviceOption dev_opt(
device_type, BOOST_GET_CONST(platform::CUDAPlace, place_).device);
gc_event_.emplace_back(dev_opt);
#endif
// int device_type = static_cast<int>(paddle::platform::DeviceType::CUDA);
// paddle::platform::DeviceOption dev_opt(
// device_type, BOOST_GET_CONST(platform::CUDAPlace, place_).device);
gc_event_.emplace_back(place_);

std::vector<size_t> vec_temp;
for (auto& item : vec_instruction_[i].output_index_) {
@@ -287,8 +278,8 @@
}
}

ParseDirectAndEventRunOps(vec_func_list_, filter_next, i, &var_id2event_,
&vec_instruction_);
ParseDirectAndEventRunOps(place_, vec_func_list_, filter_next, i,
&var_id2event_, &vec_instruction_);

// checkout ouput
for (auto& item : vec_instruction_[i].output_index_) {
@@ -466,7 +457,7 @@ void InterpreterCore::CheckGC(size_t instr_id,
#if defined(PADDLE_WITH_CUDA)
auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
gc_event_[instr_id].Record(place, dev_ctx);
gc_event_[instr_id].Record(dev_ctx);
gc_queue_->AddTask(
[ container = garbages_.release(), event = &gc_event_[instr_id] ]() {
while (!event->Query()) {
@@ -483,7 +474,7 @@
#if defined(PADDLE_WITH_CUDA)
auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
gc_event_[instr_id].Record(place, dev_ctx);
gc_event_[instr_id].Record(dev_ctx);
gc_queue_->AddTask(
[ container = garbages_.release(), event = &gc_event_[instr_id] ]() {
while (!event->Query()) {
@@ -857,34 +848,23 @@ void InterpreterCore::RecordEventInstruction(const Instruction& instruction,
// If InterpreterCore in on CPUPlace, do nothing.
if (platform::is_cpu_place(place_)) return;

#ifdef PADDLE_WITH_CUDA
const platform::CUDADeviceContext* dev_ctx =
reinterpret_cast<const platform::CUDADeviceContext*>(
instruction.dev_ctx_);
for (auto& event : instruction.output_events_) {
VLOG(3) << "Record event in out_var_id: " << event.var_id_;
event.event_->Record(*(dev_ctx->context()->Stream()));
event.event_->Record(instruction.dev_ctx_);
}
#endif
}

void InterpreterCore::WaitOrSync(const std::vector<EventInter>& events,
const platform::DeviceContext* dev_ctx) {
#ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx =
reinterpret_cast<const platform::CUDADeviceContext*>(dev_ctx);

for (auto& event : events) {
if (event.is_sync_) {
VLOG(3) << "host sync wait in_var_id " << event.var_id_;
event.event_->Synchronize();
for (auto& event_iter : events) {
if (event_iter.is_sync_) {
VLOG(3) << "host sync wait in_var_id " << event_iter.var_id_;
event_iter.event_->Wait(platform::kCPU, dev_ctx);
} else {
VLOG(3) << "stream async wait in_var_id " << event.var_id_;
cuda_dev_ctx->context()->Stream()->WaitEvent(
event.event_->GetRawCudaEvent());
VLOG(3) << "stream async wait in_var_id " << event_iter.var_id_;
event_iter.event_->Wait(platform::kCUDA, dev_ctx);
}
}
#endif
}

void InterpreterCore::StreamWaitEventOrSync(const Instruction& instruction) {
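Taken together, these hunks replace the CUDA-only `CudaEvent` path with the place-agnostic `DeviceEvent` API. A condensed sketch of the producer/consumer pattern the interpreter now follows; only the `DeviceEvent` calls visible in the diff (`GenerateDeviceEventFlag`, `Record`, `Wait`) are used, and the surrounding setup is assumed rather than taken from the commit:

```cpp
#include <memory>
#include "paddle/fluid/platform/device_event.h"

namespace plat = paddle::platform;

// Sketch only: one event guards one cross-stream variable, as in
// AssociateInputWithEvents above.
void SketchCrossStreamSync(const plat::Place& place,
                           const plat::DeviceContext* producer_ctx,
                           const plat::DeviceContext* consumer_ctx,
                           bool consumer_on_cpu) {
  auto event = std::make_shared<plat::DeviceEvent>(
      place, plat::GenerateDeviceEventFlag());

  // Producer records the event on its stream after writing the variable.
  event->Record(producer_ctx);

  // Consumer either synchronizes the host (GPU -> CPU) or makes its own
  // stream wait on the event (GPU -> GPU on a different stream).
  event->Wait(consumer_on_cpu ? plat::kCPU : plat::kCUDA, consumer_ctx);
}
```

The same abstraction is what lets the `#ifdef PADDLE_WITH_CUDA` guards and the explicit `CUDADeviceContext` casts disappear from `WaitOrSync` and the GC path above.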
3 changes: 1 addition & 2 deletions paddle/fluid/framework/new_executor/interpretercore.h
@@ -26,7 +26,6 @@
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_event.h"
#include "paddle/fluid/platform/event.h"

namespace paddle {
namespace framework {
@@ -101,7 +100,7 @@ class InterpreterCore {
bool is_build_;

std::vector<std::string> feed_names_;
std::map<size_t, std::shared_ptr<platform::CudaEvent>> var_id2event_;
std::map<size_t, std::shared_ptr<platform::DeviceEvent>> var_id2event_;

std::vector<paddle::platform::DeviceEvent> gc_event_;
std::unique_ptr<GarbageQueue> garbages_;
6 changes: 4 additions & 2 deletions paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -19,6 +19,7 @@
#include <vector>

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_event_base.h"
#include "paddle/fluid/platform/event.h"

namespace paddle {
@@ -56,11 +57,12 @@ struct NextInstruction {
};

struct EventInter {
explicit EventInter(size_t var_id, std::shared_ptr<platform::CudaEvent> event,
explicit EventInter(size_t var_id,
std::shared_ptr<platform::DeviceEvent> event,
bool is_sync)
: var_id_(var_id), event_(event), is_sync_(is_sync) {}
size_t var_id_;
std::shared_ptr<platform::CudaEvent> event_;
std::shared_ptr<platform::DeviceEvent> event_;
bool is_sync_;
};

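With the struct now holding a `platform::DeviceEvent`, an `EventInter` entry can be built for any place, not just CUDA. A hypothetical construction helper (not part of the commit), assuming the flag helper and headers shown elsewhere in this diff:

```cpp
#include <memory>
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/device_event_base.h"  // assumed to declare DeviceEvent

// Hypothetical helper: packages a variable id with a freshly created
// DeviceEvent for the given place.
paddle::framework::EventInter MakeEventInter(
    size_t var_id, const paddle::platform::Place& place, bool is_sync) {
  auto event = std::make_shared<paddle::platform::DeviceEvent>(
      place, paddle::platform::GenerateDeviceEventFlag());
  return paddle::framework::EventInter(var_id, event, is_sync);
}
```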
1 change: 0 additions & 1 deletion paddle/fluid/inference/CMakeLists.txt
@@ -78,7 +78,6 @@ set(SHARED_INFERENCE_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/io_utils.cc
${mkldnn_quantizer_src_file}
${PADDLE_CUSTOM_OP_SRCS})

# shared inference library deps
13 changes: 11 additions & 2 deletions paddle/fluid/inference/api/CMakeLists.txt
@@ -20,10 +20,9 @@ endif(APPLE)
add_subdirectory(details)

if(WITH_MKLDNN)
set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc)
set(mkldnn_quantizer_cfg mkldnn_quantizer_config)
set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn_quantizer.cc)
cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder)
set(mkldnn_quantizer_src_file ${mkldnn_quantizer_src} PARENT_SCOPE)
set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE)
endif()

@@ -71,6 +70,16 @@ elseif (WIN32)
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()

if(WITH_TESTING AND WITH_MKLDNN)
if (NOT APPLE AND NOT WIN32)
cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR})
elseif (WIN32)
cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
endif()
endif()

if(WITH_TESTING AND TEST test_api_impl)
if(NOT APPLE)
set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120)