
Commit

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into api_advanced_config
zyfncg committed Feb 17, 2022
2 parents c373856 + f29da15 commit 9cf36f5
Showing 187 changed files with 3,175 additions and 2,565 deletions.
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()

if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220119")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220215")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
131 changes: 29 additions & 102 deletions paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -53,7 +53,6 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data,
} else if (input_data.dtype == DistModelDataType::INT32) {
input_tensor_ptr = input_tensor->mutable_data<int32_t>(dims, place);
} else {
-    // Q(fleet exe dev): for input/output, should we support fp16
LOG(ERROR) << "unsupported feed type " << input_data.dtype;
return false;
}
@@ -113,14 +112,6 @@ std::string DistModelDTypeToString(DistModelDataType dtype) {
return "NOT SUPPORT DTYPE";
}

-bool IsPPFirstStage(const DistModelConfig &config) {
-  return config.local_rank - config.mp_degree < 0;
-}
-
-bool IsPPLastStage(const DistModelConfig &config) {
-  return config.local_rank + config.mp_degree >= config.nranks;
-}
-
class DistModelTimer {
public:
void tic() { tic_time = std::chrono::high_resolution_clock::now(); }
@@ -197,65 +188,34 @@ bool DistModel::PreparePlace() {
}

bool DistModel::CommInit() {
-  // NOTE (Yuang Liu): The peer endpoints will be obtained with the assumption
-  // that mp part is always on inner side and pp part is always on outer side.
-  // TODO(fleet exe dev): The peer endpoints could be configured by users.
-  PADDLE_ENFORCE_EQ(
-      config_.pp_degree * config_.mp_degree, config_.nranks,
-      platform::errors::InvalidArgument(
-          "The mp_degree multiplies pp_degree is not equal with nranks"));
  std::unique_ptr<framework::ProgramDesc> comm_init_program(
      new framework::ProgramDesc());
  framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0);
-  if (config_.mp_degree > 1) {
-    PADDLE_ENFORCE_GE(
-        config_.mp_ring_id, 0,
-        platform::errors::InvalidArgument(
-            "mp ring id must be provided for inference under mp."));
-    VLOG(3) << "Init comm group for mp.";
+  std::vector<int64_t> &ring_ids =
+      config_.rank_to_ring_ids_[config_.local_rank];
+  int64_t order = 0;
+  std::string var_name_base = "comm_init_";
+  for (int64_t ring_id : ring_ids) {
+    VLOG(3) << "Init comm for ring id: " << ring_id;
+    int64_t ranks_in_group = config_.ring_id_to_ranks_[ring_id].size();
+    int64_t rank_in_group = 0;
+    std::vector<int64_t> &ranks = config_.ring_id_to_ranks_[ring_id];
+    for (int64_t rank : ranks) {
+      if (config_.local_rank == rank) {
+        break;
+      }
+      rank_in_group += 1;
+    }
    std::vector<std::string> peer_endpoints;
-    for (int64_t
-             idx = (config_.local_rank / config_.mp_degree) * config_.mp_degree,
-             i = 0;
-         i < config_.mp_degree; ++idx, ++i) {
-      if (config_.trainer_endpoints[idx] == config_.current_endpoint) {
+    for (int64_t rank : ranks) {
+      if (config_.local_rank == rank) {
        continue;
      }
-      peer_endpoints.emplace_back(config_.trainer_endpoints[idx]);
-    }
-    // get nranks in a mp group and inner group rank for local rank
-    int64_t mp_group_nranks = config_.nranks / config_.pp_degree;
-    int64_t mp_group_rank = config_.local_rank % config_.mp_degree;
-    InsertCommOp("mp_comm_id", mp_group_nranks, mp_group_rank, peer_endpoints,
-                 comm_init_block, config_.mp_ring_id);
-  }
-  if (config_.pp_degree > 1) {
-    VLOG(3) << "Init comm group for pp.";
-    if (!IsPPFirstStage(config_)) {
-      PADDLE_ENFORCE_EQ(config_.pp_upstream_ring_id >= 0, true,
-                        platform::errors::InvalidArgument(
-                            "pp upstream ring id must be provided for "
-                            "non-first pp stage if inference under pp."));
-      // not the first pp stage, has upstream
-      std::vector<std::string> upstream_peer_endpoints;
-      upstream_peer_endpoints.emplace_back(
-          config_.trainer_endpoints[config_.local_rank - config_.mp_degree]);
-      InsertCommOp("pp_upstream_comm_id", 2, 1, upstream_peer_endpoints,
-                   comm_init_block, config_.pp_upstream_ring_id);
-    }
-
-    if (!IsPPLastStage(config_)) {
-      PADDLE_ENFORCE_EQ(config_.pp_downstream_ring_id >= 0, true,
-                        platform::errors::InvalidArgument(
-                            "pp downstream ring id must be provided for "
-                            "non-last pp stage if inference under pp."));
-      // not the last pp stage, has downstream
-      std::vector<std::string> downstream_peer_endpoints;
-      downstream_peer_endpoints.emplace_back(
-          config_.trainer_endpoints[config_.local_rank + config_.mp_degree]);
-      InsertCommOp("pp_downstream_comm_id", 2, 0, downstream_peer_endpoints,
-                   comm_init_block, config_.pp_downstream_ring_id);
+      peer_endpoints.emplace_back(config_.trainer_endpoints[rank]);
    }
+    InsertCommOp(var_name_base + std::to_string(order), ranks_in_group,
+                 rank_in_group, peer_endpoints, comm_init_block, ring_id);
+    order += 1;
  }
framework::NaiveExecutor e(place_);
e.CreateVariables(*comm_init_program, 0, true, scope_.get());
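
To make the refactor above concrete, here is a minimal, self-contained C++ sketch of the derivation CommInit() now performs per ring. The 4-rank topology, endpoints, and values are hypothetical illustrations, not code from this commit:

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Hypothetical layout: ring 0 connects ranks {0, 1}, ring 1 connects
  // ranks {2, 3}; each rank lists the rings it participates in.
  std::map<int64_t, std::vector<int64_t>> ring_id_to_ranks{{0, {0, 1}},
                                                           {1, {2, 3}}};
  std::map<int64_t, std::vector<int64_t>> rank_to_ring_ids{
      {0, {0}}, {1, {0}}, {2, {1}}, {3, {1}}};
  std::vector<std::string> trainer_endpoints{
      "127.0.0.1:6170", "127.0.0.1:6171", "127.0.0.1:6172", "127.0.0.1:6173"};
  int64_t local_rank = 1;

  // For each ring this rank joins: its index inside the ring and the
  // endpoints of every peer, mirroring what CommInit() hands to InsertCommOp.
  for (int64_t ring_id : rank_to_ring_ids[local_rank]) {
    const std::vector<int64_t>& ranks = ring_id_to_ranks[ring_id];
    int64_t rank_in_group = 0;
    for (int64_t rank : ranks) {
      if (rank == local_rank) break;
      ++rank_in_group;
    }
    std::vector<std::string> peer_endpoints;
    for (int64_t rank : ranks) {
      if (rank != local_rank) peer_endpoints.push_back(trainer_endpoints[rank]);
    }
    std::cout << "ring " << ring_id << ": rank_in_group=" << rank_in_group
              << ", peers=" << peer_endpoints.size() << "\n";
  }
  return 0;
}

One upside of this shape over the old mp/pp arithmetic is that any ring layout can be expressed, without the removed assumption that tensor-parallel groups sit on the inner side and pipeline stages on the outer side.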
@@ -409,12 +369,7 @@ bool DistModel::LoadParameters() {

bool DistModel::PrepareFleetExe() {
task_node_.reset(new TaskNode(program_.get(), config_.local_rank));
-  if (config_.local_rank - config_.mp_degree >= 0) {
-    task_node_->AddUpstreamTask(config_.local_rank - config_.mp_degree);
-  }
-  if (config_.local_rank + config_.mp_degree < config_.nranks) {
-    task_node_->AddDownstreamTask(config_.local_rank + config_.mp_degree);
-  }
+  // With auto cut, there is no concept of pp, no need to add dependency.
task_node_->SetType("Compute");
task_node_->Init();
executor_desc_ = FleetExecutorDesc();
@@ -473,40 +428,13 @@ bool DistModel::PrepareFeedAndFetch() {
}
}

-  if (config_.pp_degree == 1) {
-    if (feeds_.size() == 0) {
-      LOG(ERROR) << "No feed ops in the inf program, please check the program.";
-      return false;
-    }
-    if (fetches_.size() == 0) {
-      LOG(ERROR) << "No fetch op in the inf program, please check the program.";
-      return false;
-    }
-  } else {
-    if (IsPPFirstStage(config_)) {
-      if (feeds_.size() == 0) {
-        LOG(ERROR) << "Feed ops are needed for the first pp stage.";
-        return false;
-      }
-    } else {
-      if (feeds_.size() > 0) {
-        LOG(WARNING) << "Feed op is found in the non-first stage of pp.";
-      } else {
-        LOG(INFO) << "No feed ops in non-first pp stage.";
-      }
-    }
-    if (IsPPLastStage(config_)) {
-      if (fetches_.size() == 0) {
-        LOG(WARNING) << "No fetch op was found in the last pp stage. Make sure "
-                        "the result has been sent to frist pp stage.";
-      }
-    } else {
-      if (fetches_.size() > 0) {
-        LOG(WARNING) << "Fetch op is found in the non-last stage of pp.";
-      } else {
-        LOG(INFO) << "No fetch op in non-last pp stage.";
-      }
-    }
-  }
+  if (feeds_.size() == 0) {
+    LOG(ERROR) << "No feed ops in the inf program, please check the program.";
+    return false;
+  }
+  if (fetches_.size() == 0) {
+    LOG(ERROR) << "No fetch op in the inf program, please check the program.";
+    return false;
+  }
return true;
}
@@ -606,7 +534,6 @@ bool DistModel::FetchResult(const framework::LoDTensor &fetch,

bool DistModel::Run(const std::vector<DistModelTensor> &input_data,
std::vector<DistModelTensor> *output_data) {
-  // TODO(fleet exe dev): support pipeline inf mode
VLOG(3) << "DistModel run for once.";

DistModelTimer timer;
8 changes: 3 additions & 5 deletions paddle/fluid/distributed/fleet_executor/dist_model.h
@@ -13,6 +13,7 @@
// limitations under the License.

#pragma once
+#include <map>
#include <memory>
#include <string>
#include <vector>
@@ -47,12 +48,9 @@ struct DistModelConfig {
std::string current_endpoint{};
int64_t nranks{1};
int64_t local_rank{0};
-  int64_t mp_degree{1};
-  int64_t pp_degree{1};
-  int64_t mp_ring_id{-1};
-  int64_t pp_upstream_ring_id{-1};
-  int64_t pp_downstream_ring_id{-1};
bool enable_timer{false};
+  std::map<int64_t, std::vector<int64_t>> ring_id_to_ranks_{};
+  std::map<int64_t, std::vector<int64_t>> rank_to_ring_ids_{};
};

class DistModel {
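
For orientation, a hedged sketch of how a caller might fill the two new map fields. The struct below is a trimmed stand-in carrying only the members visible in this diff, and the 2-rank, single-ring topology is invented for illustration:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in; the real DistModelConfig in dist_model.h has more
// members (model path, endpoints, device settings, and so on).
struct DistModelConfigSketch {
  std::string current_endpoint{};
  int64_t nranks{1};
  int64_t local_rank{0};
  bool enable_timer{false};
  std::map<int64_t, std::vector<int64_t>> ring_id_to_ranks_{};
  std::map<int64_t, std::vector<int64_t>> rank_to_ring_ids_{};
};

int main() {
  DistModelConfigSketch cfg;
  cfg.nranks = 2;
  cfg.local_rank = 0;
  cfg.ring_id_to_ranks_ = {{0, {0, 1}}};         // ring 0 spans both ranks
  cfg.rank_to_ring_ids_ = {{0, {0}}, {1, {0}}};  // each rank joins ring 0
  return 0;
}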
@@ -177,5 +177,5 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {

USE_OP_ITSELF(scale);
USE_OP_ITSELF(elementwise_add);
-USE_OP(matmul_v2);
+USE_OP_ITSELF(matmul_v2);
USE_OP(reduce_sum);
@@ -186,7 +186,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
}

USE_OP_ITSELF(scale);
-USE_OP(matmul_v2);
+USE_OP_ITSELF(matmul_v2);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);
USE_OP_ITSELF(elementwise_add);
@@ -213,5 +213,5 @@ TEST(Benchmark, FluidMLPCPU) {

USE_OP_ITSELF(scale);
USE_OP_ITSELF(elementwise_add);
-USE_OP(matmul_v2);
+USE_OP_ITSELF(matmul_v2);
USE_OP(reduce_sum);
@@ -246,7 +246,7 @@ TEST(Benchmark, FluidMLPCUDA) {
} // namespace paddle

USE_OP_ITSELF(scale);
-USE_OP(matmul_v2);
+USE_OP_ITSELF(matmul_v2);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);
USE_OP_ITSELF(elementwise_add);
2 changes: 1 addition & 1 deletion paddle/fluid/eager/tests/task_tests/generated_test.cc
@@ -124,4 +124,4 @@ TEST(Generated, ElementwiseAdd) {

USE_OP(sigmoid);
USE_OP_ITSELF(elementwise_add);
-USE_OP(matmul_v2);
+USE_OP_ITSELF(matmul_v2);
13 changes: 12 additions & 1 deletion paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc
@@ -72,7 +72,18 @@ void AdaptivePool2dConvertGlobalPass::ApplyImpl(ir::Graph* graph) const {
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->HasAttr("adaptive") && op->HasAttr("ksize")) {
if (op->Type() == "pool2d" && op->HasAttr("adaptive") &&
op->HasAttr("ksize")) {
if (op->HasAttr("global_pooling")) {
bool global_pooling =
BOOST_GET_CONST(bool, op->GetAttr("global_pooling"));
if (global_pooling) return;
}
if (!op->HasAttr("pooling_type")) return;
std::string type =
BOOST_GET_CONST(std::string, op->GetAttr("pooling_type"));
// adaptive has no effect on max pooling
if (type == "max") return;
bool adaptive = BOOST_GET_CONST(bool, op->GetAttr("adaptive"));
std::vector<int> ksize =
BOOST_GET_CONST(std::vector<int>, op->GetAttr("ksize"));
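
For readers skimming the hunk above, here are the added guards restated as a standalone predicate. The struct and function names are invented for illustration; the real pass reads attributes through OpDesc with BOOST_GET_CONST and, when a guard fails, returns from ApplyImpl outright rather than skipping a single op:

#include <iostream>
#include <string>
#include <vector>

// Hypothetical mirror of the pool2d attributes the pass inspects; an empty
// ksize stands in for a missing "ksize" attribute.
struct Pool2dAttrs {
  std::string op_type;
  bool has_adaptive = false;
  bool adaptive = false;
  bool has_global_pooling = false;
  bool global_pooling = false;
  bool has_pooling_type = false;
  std::string pooling_type;
  std::vector<int> ksize;
};

// Guard order added by this hunk: pool2d ops only, skip pooling that is
// already global, require pooling_type, and skip "max" because adaptive has
// no effect on max pooling. The rest of the pass (truncated in this view)
// then inspects the adaptive flag and ksize before rewriting.
bool PassesNewGuards(const Pool2dAttrs& op) {
  if (op.op_type != "pool2d" || !op.has_adaptive || op.ksize.empty())
    return false;
  if (op.has_global_pooling && op.global_pooling) return false;
  if (!op.has_pooling_type) return false;
  if (op.pooling_type == "max") return false;
  return true;
}

int main() {
  Pool2dAttrs avg{"pool2d", true, true, false, false, true, "avg", {1, 1}};
  Pool2dAttrs max_p{"pool2d", true, true, false, false, true, "max", {1, 1}};
  std::cout << PassesNewGuards(avg) << PassesNewGuards(max_p) << "\n";  // 10
  return 0;
}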
@@ -29,6 +29,8 @@ TEST(AdaptivePool2dConvertGlobalPass, basic) {
AttributeMap attrs;
attrs["adaptive"] = true;
attrs["ksize"] = std::vector<int>{1, 1};
attrs["pooling_type"] =
std::string("avg"); // adaptive has no effect on max pooling
layers.pool2d(x, false, &attrs);

std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
@@ -157,7 +157,7 @@ ConvActivationFusePass::ConvActivationFusePass() {
// IsStringIn({"NHWC", "NCHW"}) MobileNetV2 has no this attribute
.AddAttr("data_format")
.IsOptional()
.IsStringIn({"NCHW", "AnyLayout"})
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.End();

AddOpCompat(OpCompat("relu"))
@@ -115,7 +115,7 @@ Conv2DTransposeBiasFusePass::Conv2DTransposeBiasFusePass() {
.IsStringIn({"EXPLICIT", "SAME", "VALID"})
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW"})
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.End();

AddOpCompat(OpCompat("elementwise_add"))
@@ -129,7 +129,7 @@ Conv2DTransposeBiasFusePass::Conv2DTransposeBiasFusePass() {
.IsTensor()
.End()
.AddAttr("axis")
-      .IsIntIn({1})
+      .IsIntIn({1, 3})
.End();
}

@@ -59,7 +59,7 @@ ConvConcatReLUFusePass::ConvConcatReLUFusePass() {
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW"})
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.End();

AddOpCompat(OpCompat("concat"))
25 changes: 22 additions & 3 deletions paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
@@ -132,6 +132,27 @@ bool CPUQuantizeSquashPass::IsDequantizeInputUint8(
return false;
}

+bool CPUQuantizeSquashPass::IsDequantizeQuantizeIncompatible(
+    Node* quant_op, Node* dequant_in, Node* next_op) const {
+  bool is_concat_signed =
+      quant_op->Op()->GetAttrIfExists<bool>("is_negative_input");
+  bool is_input_unsigned = IsDequantizeInputUint8(dequant_in);
+  /* TODO(sfraczek): remove elementwise from this condition when BinaryMKLDNN
+     kernel will support two different input data types */
+  bool is_next_op_concat_or_elementwise =
+      next_op->Op()->Type() == "concat" ||
+      next_op->Op()->Type().find("elementwise") == 0;
+  if (is_next_op_concat_or_elementwise && is_concat_signed &&
+      is_input_unsigned) {
+    VLOG(4) << "Do not squash dequant-quant, because "
+            << "next_op is: " << next_op->Op()->Type()
+            << ", is_concat_signed: " << is_concat_signed
+            << ", is_input_unsigned: " << is_input_unsigned << ".";
+    return true;
+  }
+  return false;
+}
+
void CPUQuantizeSquashPass::DequantQuantSquash(
Graph* graph,
std::unordered_map<const Node*, int>* nodes_keep_counter) const {
@@ -151,9 +172,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash(
GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern);
GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern);

-    // Don't squash if e.g. just one concat input is unsigned
-    if (IsDequantizeInputUint8(dequant_in) &&
-        !quant_op->Op()->GetAttrIfExists<bool>("is_negative_input")) {
+    if (IsDequantizeQuantizeIncompatible(quant_op, dequant_in, next_op)) {
return;
}

9 changes: 9 additions & 0 deletions paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
@@ -48,6 +48,15 @@ class CPUQuantizeSquashPass : public FusePassBase {
*/
bool IsDequantizeInputUint8(const Node* dequant_in) const;

+  /*
+   * Don't squash unsigned dequantize with signed quantize.
+   * This is important for concat and elementwise ops.
+   * When inputs have different sign, concat will assume signed type and
+   * elementwise assumes first input type.
+   */
+  bool IsDequantizeQuantizeIncompatible(Node* quant_op, Node* dequant_in,
+                                        Node* next_op) const;
+
/*
* Squash dequantize-quantize ops pairs into requantize or nothing
*/
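
As a back-of-envelope illustration of why the new check matters (scale and value invented; this is not code from the pass): squashing a dequantize that produces unsigned INT8 data with a quantize that assumes signed input would push values above 127 through a signed range and clip them:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const float scale = 0.5f;    // illustrative quantization scale
  const uint8_t u8_val = 200;  // valid unsigned INT8 activation

  // Dequantize on the unsigned path, then re-quantize on a signed path,
  // which is roughly what a dequant-quant squash into one requantize does.
  float real_val = u8_val * scale;                           // 100.0f
  int32_t requant = static_cast<int32_t>(real_val / scale);  // back to 200
  int8_t s8_val = static_cast<int8_t>(
      std::min<int32_t>(std::max<int32_t>(requant, -128), 127));

  std::cout << static_cast<int>(s8_val) << "\n";  // 127: the value is clipped
  return 0;
}

This matches the comment in the header above: when input signs differ, concat assumes the signed type (and elementwise the first input's type), so the pass now skips the squash instead.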
(Diffs for the remaining changed files were not loaded on this page.)